Okular

textpage.cpp
1 /*
2  SPDX-FileCopyrightText: 2005 Piotr Szymanski <[email protected]>
3 
4  SPDX-License-Identifier: GPL-2.0-or-later
5 */
6 
7 #include "textpage.h"
8 #include "textpage_p.h"
9 
10 #include <QDebug>
11 
12 #include "area.h"
13 #include "debug_p.h"
14 #include "misc.h"
15 #include "page.h"
16 #include "page_p.h"
17 
18 #include <cstring>
19 
20 #include <QVarLengthArray>
21 #include <QtAlgorithms>
22 
23 using namespace Okular;
24 
25 class SearchPoint
26 {
27 public:
28  SearchPoint()
29  : offset_begin(-1)
30  , offset_end(-1)
31  {
32  }
33 
34  /** The TinyTextEntity containing the first character of the match. */
35  TextList::ConstIterator it_begin;
36 
37  /** The TinyTextEntity containing the last character of the match. */
38  TextList::ConstIterator it_end;
39 
40  /** The index of the first character of the match in (*it_begin)->text().
41  * Satisfies 0 <= offset_begin < (*it_begin)->text().length().
42  */
43  int offset_begin;
44 
45  /** One plus the index of the last character of the match in (*it_end)->text().
46  * Satisfies 0 < offset_end <= (*it_end)->text().length().
47  */
48  int offset_end;
49 };
50 
51 /* text comparison functions */
52 
53 static bool CaseInsensitiveCmpFn(const QStringRef &from, const QStringRef &to)
54 {
55 #ifdef DEBUG_TEXTPAGE
56  qDebug(OkularCoreDebug) << from << ":" << to << "(case insensitive)";
57 #endif
58  return from.compare(to, Qt::CaseInsensitive) == 0;
59 }
60 
61 static bool CaseSensitiveCmpFn(const QStringRef &from, const QStringRef &to)
62 {
63 #ifdef DEBUG_TEXTPAGE
64  qDebug(OkularCoreDebug) << from << ":" << to << "(case sensitive)";
65 #endif
66  return from.compare(to, Qt::CaseSensitive) == 0;
67 }
68 
69 /**
70  * Returns true iff segments [@p left1, @p right1] and [@p left2, @p right2] on the real line
71  * overlap within @p threshold percent, i. e. iff the ratio of the length of the
72  * intersection of the segments to the length of the shortest of the two input segments
73  * is not smaller than the threshold.
74  */
75 static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold)
76 {
77  // check if one consumes another fully (speed optimization)
78 
79  if (left1 <= left2 && right1 >= right2) {
80  return true;
81  }
82 
83  if (left1 >= left2 && right1 <= right2) {
84  return true;
85  }
86 
87  // check if there is overlap above threshold
88  if (right2 >= left1 && right1 >= left2) {
89  double overlap = (right2 >= right1) ? right1 - left2 : right2 - left1;
90 
91  double length1 = right1 - left1, length2 = right2 - left2;
92 
93  return overlap * 100 >= threshold * qMin(length1, length2);
94  }
95 
96  return false;
97 }
98 
99 static bool doesConsumeY(const QRect first, const QRect second, int threshold)
100 {
101  return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold);
102 }
103 
104 static bool doesConsumeY(const NormalizedRect &first, const NormalizedRect &second, int threshold)
105 {
106  return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold);
107 }
108 
109 /*
110  Rationale behind TinyTextEntity:
111 
112  instead of storing directly a QString for the text of an entity,
113  we store the UTF-16 data and their length. This way, we save about
114  4 int's wrt a QString, and we can create a new string from that
115  raw data (that's the only penalty of that).
116  Even better, if the string we need to store has at most
117  MaxStaticChars characters, then we store those in place of the QChar*
118  that would be used (with new[] + free[]) for the data.
119  */
120 class TinyTextEntity
121 {
122  static const int MaxStaticChars = sizeof(void *) / sizeof(QChar);
123 
124 public:
125  TinyTextEntity(const QString &text, const NormalizedRect &rect)
126  : area(rect)
127  {
128  Q_ASSERT_X(!text.isEmpty(), "TinyTextEntity", "empty string");
129  Q_ASSERT_X(sizeof(d) == sizeof(void *), "TinyTextEntity", "internal storage is wider than QChar*, fix it!");
130  length = text.length();
131  switch (length) {
132 #if QT_POINTER_SIZE >= 8
133  case 4:
134  d.qc[3] = text.at(3).unicode();
135  // fall through
136  case 3:
137  d.qc[2] = text.at(2).unicode();
138 #endif
139  // fall through
140  case 2:
141  d.qc[1] = text.at(1).unicode();
142  // fall through
143  case 1:
144  d.qc[0] = text.at(0).unicode();
145  break;
146  default:
147  d.data = new QChar[length];
148  std::memcpy(d.data, text.constData(), length * sizeof(QChar));
149  }
150  }
151 
152  ~TinyTextEntity()
153  {
154  if (length > MaxStaticChars) {
155  delete[] d.data;
156  }
157  }
158 
159  inline QString text() const
160  {
161  return length <= MaxStaticChars ? QString::fromRawData((const QChar *)&d.qc[0], length) : QString::fromRawData(d.data, length);
162  }
163 
164  inline NormalizedRect transformedArea(const QTransform &matrix) const
165  {
166  NormalizedRect transformed_area = area;
167  transformed_area.transform(matrix);
168  return transformed_area;
169  }
170 
171  NormalizedRect area;
172 
173 private:
174  Q_DISABLE_COPY(TinyTextEntity)
175 
176  union {
177  QChar *data;
178  ushort qc[MaxStaticChars];
179  } d;
180  int length;
181 };
182 
184  : m_text(text)
185  , m_area(area)
186  , d(nullptr)
187 {
188 }
189 
191 {
192  delete m_area;
193 }
194 
196 {
197  return m_text;
198 }
199 
201 {
202  return m_area;
203 }
204 
206 {
207  NormalizedRect transformed_area = *m_area;
208  transformed_area.transform(matrix);
209  return transformed_area;
210 }
211 
212 TextPagePrivate::TextPagePrivate()
213  : m_page(nullptr)
214 {
215 }
216 
217 TextPagePrivate::~TextPagePrivate()
218 {
219  qDeleteAll(m_searchPoints);
220  qDeleteAll(m_words);
221 }
222 
224  : d(new TextPagePrivate())
225 {
226 }
227 
229  : d(new TextPagePrivate())
230 {
232  for (; it != itEnd; ++it) {
233  TextEntity *e = *it;
234  if (!e->text().isEmpty()) {
235  d->m_words.append(new TinyTextEntity(e->text(), *e->area()));
236  }
237  delete e;
238  }
239 }
240 
242 {
243  delete d;
244 }
245 
246 void TextPage::append(const QString &text, NormalizedRect *area)
247 {
248  if (!text.isEmpty()) {
249  if (!d->m_words.isEmpty()) {
250  TinyTextEntity *lastEntity = d->m_words.last();
251  const QString concatText = lastEntity->text() + text.normalized(QString::NormalizationForm_KC);
252  if (concatText != concatText.normalized(QString::NormalizationForm_KC)) {
253  // If this happens it means that the new text + old one have combined, for example A and ◌̊ form Å
254  NormalizedRect newArea = *area | lastEntity->area;
255  delete area;
256  delete lastEntity;
257  d->m_words.removeLast();
258  d->m_words.append(new TinyTextEntity(concatText.normalized(QString::NormalizationForm_KC), newArea));
259  return;
260  }
261  }
262 
263  d->m_words.append(new TinyTextEntity(text.normalized(QString::NormalizationForm_KC), *area));
264  }
265  delete area;
266 }
267 
268 struct WordWithCharacters {
269  WordWithCharacters(TinyTextEntity *w, const TextList &c)
270  : word(w)
271  , characters(c)
272  {
273  }
274 
275  inline QString text() const
276  {
277  return word->text();
278  }
279 
280  inline const NormalizedRect &area() const
281  {
282  return word->area;
283  }
284 
285  TinyTextEntity *word;
286  TextList characters;
287 };
289 
290 /**
291  * We will divide the whole page in some regions depending on the horizontal and
292  * vertical spacing among different regions. Each region will have an area and an
293  * associated WordsWithCharacters in sorted order.
294  */
295 class RegionText
296 {
297 public:
298  RegionText() {};
299 
300  RegionText(const WordsWithCharacters &wordsWithCharacters, const QRect area)
301  : m_region_wordWithCharacters(wordsWithCharacters)
302  , m_area(area)
303  {
304  }
305 
306  inline QString string() const
307  {
308  QString res;
309  for (const WordWithCharacters &word : m_region_wordWithCharacters) {
310  res += word.text();
311  }
312  return res;
313  }
314 
315  inline WordsWithCharacters text() const
316  {
317  return m_region_wordWithCharacters;
318  }
319 
320  inline QRect area() const
321  {
322  return m_area;
323  }
324 
325  inline void setArea(const QRect area)
326  {
327  m_area = area;
328  }
329 
330  inline void setText(const WordsWithCharacters &wordsWithCharacters)
331  {
332  m_region_wordWithCharacters = wordsWithCharacters;
333  }
334 
335 private:
336  WordsWithCharacters m_region_wordWithCharacters;
337  QRect m_area;
338 };
339 
341 {
342  if (d->m_words.isEmpty()) {
343  return new RegularAreaRect();
344  }
345 
346  /**
347  It works like this:
348  There are two cursors, we need to select all the text between them. The coordinates are normalised, leftTop is (0,0)
349  rightBottom is (1,1), so for cursors start (sx,sy) and end (ex,ey) we start with finding text rectangles under those
350  points, if not we search for the first that is to the right to it in the same baseline, if none found, then we search
351  for the first rectangle with a baseline under the cursor, having two points that are the best rectangles to both
352  of the cursors: (rx,ry)x(tx,ty) for start and (ux,uy)x(vx,vy) for end, we do a
353  1. (rx,ry)x(1,ty)
354  2. (0,ty)x(1,uy)
355  3. (0,uy)x(vx,vy)
356 
357  To find the closest rectangle to cursor (cx,cy) we search for a rectangle that either contains the cursor
358  or that has a left border >= cx and bottom border >= cy.
359  */
360  RegularAreaRect *ret = new RegularAreaRect;
361 
362  PagePrivate *pagePrivate = PagePrivate::get(d->m_page);
363  const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
364  const double scaleX = d->m_page->width();
365  const double scaleY = d->m_page->height();
366 
367  NormalizedPoint startC = sel->start();
368  NormalizedPoint endC = sel->end();
369  NormalizedPoint temp;
370 
371  // if startPoint is right to endPoint swap them
372  if (startC.x > endC.x) {
373  temp = startC;
374  startC = endC;
375  endC = temp;
376  }
377 
378  // minX,maxX,minY,maxY gives the bounding rectangle coordinates of the document
379  const NormalizedRect boundingRect = d->m_page->boundingBox();
380  const QRect content = boundingRect.geometry(scaleX, scaleY);
381  const double minX = content.left();
382  const double maxX = content.right();
383  const double minY = content.top();
384  const double maxY = content.bottom();
385 
386  /**
387  * We will now find out the TinyTextEntity for the startRectangle and TinyTextEntity for
388  * the endRectangle. We have four cases:
389  *
390  * Case 1(a): both startpoint and endpoint are out of the bounding Rectangle and at one side, so the rectangle made of start
391  * and endPoint are outof the bounding rect (do not intersect)
392  *
393  * Case 1(b): both startpoint and endpoint are out of bounding rect, but they are in different side, so is their rectangle
394  *
395  * Case 2(a): find the rectangle which contains start and endpoint and having some
396  * TextEntity
397  *
398  * Case 2(b): if 2(a) fails (if startPoint and endPoint both are unchanged), then we check whether there is any
399  * TextEntity within the rect made by startPoint and endPoint
400  *
401  * Case 3: Now, we may have two type of selection.
402  * 1. startpoint is left-top of start_end and endpoint is right-bottom
403  * 2. startpoint is left-bottom of start_end and endpoint is top-right
404  *
405  * Also, as 2(b) is passed, we might have it,itEnd or both unchanged, but the fact is that we have
406  * text within them. so, we need to search for the best suitable textposition for start and end.
407  *
408  * Case 3(a): We search the nearest rectangle consisting of some
409  * TinyTextEntity right to or bottom of the startPoint for selection 01.
410  * And, for selection 02, we have to search for right and top
411  *
412  * Case 3(b): For endpoint, we have to find the point top of or left to
413  * endpoint if we have selection 01.
414  * Otherwise, the search will be left and bottom
415  */
416 
417  // we know that startC.x > endC.x, we need to decide which is top and which is bottom
418  const NormalizedRect start_end = (startC.y < endC.y) ? NormalizedRect(startC.x, startC.y, endC.x, endC.y) : NormalizedRect(startC.x, endC.y, endC.x, startC.y);
419 
420  // Case 1(a)
421  if (!boundingRect.intersects(start_end)) {
422  return ret;
423  } else {
424  // case 1(b)
425  /**
426  note that, after swapping of start and end, we know that,
427  start is always left to end. but, we cannot say start is
428  positioned upper than end.
429  **/
430  // if start is left to content rect take it to content rect boundary
431  if (startC.x * scaleX < minX) {
432  startC.x = minX / scaleX;
433  }
434  if (endC.x * scaleX > maxX) {
435  endC.x = maxX / scaleX;
436  }
437 
438  // if start is top to end (selection type 01)
439  if (startC.y * scaleY < minY) {
440  startC.y = minY / scaleY;
441  }
442  if (endC.y * scaleY > maxY) {
443  endC.y = maxY / scaleY;
444  }
445 
446  // if start is bottom to end (selection type 02)
447  if (startC.y * scaleY > maxY) {
448  startC.y = maxY / scaleY;
449  }
450  if (endC.y * scaleY < minY) {
451  endC.y = minY / scaleY;
452  }
453  }
454 
455  TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
456  TextList::ConstIterator start = it, end = itEnd, tmpIt = it; //, tmpItEnd = itEnd;
457  const MergeSide side = d->m_page ? (MergeSide)d->m_page->totalOrientation() : MergeRight;
458 
459  NormalizedRect tmp;
460  // case 2(a)
461  for (; it != itEnd; ++it) {
462  tmp = (*it)->area;
463  if (tmp.contains(startC.x, startC.y)) {
464  start = it;
465  }
466  if (tmp.contains(endC.x, endC.y)) {
467  end = it;
468  }
469  }
470 
471  // case 2(b)
472  it = tmpIt;
473  if (start == it && end == itEnd) {
474  for (; it != itEnd; ++it) {
475  // is there any text rectangle within the start_end rect
476  tmp = (*it)->area;
477  if (start_end.intersects(tmp)) {
478  break;
479  }
480  }
481 
482  // we have searched every text entities, but none is within the rectangle created by start and end
483  // so, no selection should be done
484  if (it == itEnd) {
485  return ret;
486  }
487  }
488  it = tmpIt;
489  bool selection_two_start = false;
490 
491  // case 3.a
492  if (start == it) {
493  NormalizedRect rect;
494 
495  // selection type 01
496  if (startC.y <= endC.y) {
497  for (; it != itEnd; ++it) {
498  rect = (*it)->area;
499  bool flagV = !rect.isBottom(startC);
500 
501  if (flagV && rect.isRight(startC)) {
502  start = it;
503  break;
504  }
505  }
506  }
507 
508  // selection type 02
509  else {
510  selection_two_start = true;
511  int distance = scaleX + scaleY + 100;
512  int count = 0;
513 
514  for (; it != itEnd; ++it) {
515  rect = (*it)->area;
516 
517  if (rect.isBottomOrLevel(startC) && rect.isRight(startC)) {
518  count++;
519  QRect entRect = rect.geometry(scaleX, scaleY);
520  int xdist, ydist;
521  xdist = entRect.center().x() - startC.x * scaleX;
522  ydist = entRect.center().y() - startC.y * scaleY;
523 
524  // make them positive
525  if (xdist < 0) {
526  xdist = -xdist;
527  }
528  if (ydist < 0) {
529  ydist = -ydist;
530  }
531 
532  if ((xdist + ydist) < distance) {
533  distance = xdist + ydist;
534  start = it;
535  }
536  }
537  }
538  }
539  }
540 
541  // case 3.b
542  if (end == itEnd) {
543  it = tmpIt;
544  itEnd = itEnd - 1;
545 
546  NormalizedRect rect;
547 
548  if (startC.y <= endC.y) {
549  for (; itEnd >= it; itEnd--) {
550  rect = (*itEnd)->area;
551  bool flagV = !rect.isTop(endC);
552 
553  if (flagV && rect.isLeft(endC)) {
554  end = itEnd;
555  break;
556  }
557  }
558  }
559 
560  else {
561  int distance = scaleX + scaleY + 100;
562  for (; itEnd >= it; itEnd--) {
563  rect = (*itEnd)->area;
564 
565  if (rect.isTopOrLevel(endC) && rect.isLeft(endC)) {
566  QRect entRect = rect.geometry(scaleX, scaleY);
567  int xdist, ydist;
568  xdist = entRect.center().x() - endC.x * scaleX;
569  ydist = entRect.center().y() - endC.y * scaleY;
570 
571  // make them positive
572  if (xdist < 0) {
573  xdist = -xdist;
574  }
575  if (ydist < 0) {
576  ydist = -ydist;
577  }
578 
579  if ((xdist + ydist) < distance) {
580  distance = xdist + ydist;
581  end = itEnd;
582  }
583  }
584  }
585  }
586  }
587 
588  /* if start and end in selection 02 are in the same column, and we
589  start at an empty space we have to remove the selection of last
590  character
591  */
592  if (selection_two_start) {
593  if (start > end) {
594  start = start - 1;
595  }
596  }
597 
598  // if start is less than end swap them
599  if (start > end) {
600  it = start;
601  start = end;
602  end = it;
603  }
604 
605  // removes the possibility of crash, in case none of 1 to 3 is true
606  if (end == d->m_words.constEnd()) {
607  end--;
608  }
609 
610  for (; start <= end; start++) {
611  ret->appendShape((*start)->transformedArea(matrix), side);
612  }
613 
614  return ret;
615 }
616 
617 RegularAreaRect *TextPage::findText(int searchID, const QString &query, SearchDirection direct, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
618 {
619  SearchDirection dir = direct;
620  // invalid search request
621  if (d->m_words.isEmpty() || query.isEmpty() || (area && area->isNull())) {
622  return nullptr;
623  }
624  TextList::ConstIterator start;
625  int start_offset = 0;
626  TextList::ConstIterator end;
627  const QMap<int, SearchPoint *>::const_iterator sIt = d->m_searchPoints.constFind(searchID);
628  if (sIt == d->m_searchPoints.constEnd()) {
629  // if no previous run of this search is found, then set it to start
630  // from the beginning (respecting the search direction)
631  if (dir == NextResult) {
632  dir = FromTop;
633  } else if (dir == PreviousResult) {
634  dir = FromBottom;
635  }
636  }
637  bool forward = true;
638  switch (dir) {
639  case FromTop:
640  start = d->m_words.constBegin();
641  start_offset = 0;
642  end = d->m_words.constEnd();
643  break;
644  case FromBottom:
645  start = d->m_words.constEnd();
646  start_offset = 0;
647  end = d->m_words.constBegin();
648  forward = false;
649  break;
650  case NextResult:
651  start = (*sIt)->it_end;
652  start_offset = (*sIt)->offset_end;
653  end = d->m_words.constEnd();
654  break;
655  case PreviousResult:
656  start = (*sIt)->it_begin;
657  start_offset = (*sIt)->offset_begin;
658  end = d->m_words.constBegin();
659  forward = false;
660  break;
661  };
662  RegularAreaRect *ret = nullptr;
663  const TextComparisonFunction cmpFn = caseSensitivity == Qt::CaseSensitive ? CaseSensitiveCmpFn : CaseInsensitiveCmpFn;
664  if (forward) {
665  ret = d->findTextInternalForward(searchID, query, cmpFn, start, start_offset, end);
666  } else {
667  ret = d->findTextInternalBackward(searchID, query, cmpFn, start, start_offset, end);
668  }
669  return ret;
670 }
671 
672 // hyphenated '-' must be at the end of a word, so hyphenation means
673 // we have a '-' just followed by a '\n' character
674 // check if the string contains a '-' character
675 // if the '-' is the last entry
676 static int stringLengthAdaptedWithHyphen(const QString &str, const TextList::ConstIterator &it, const TextList::ConstIterator &textListEnd)
677 {
678  const int len = str.length();
679 
680  // hyphenated '-' must be at the end of a word, so hyphenation means
681  // we have a '-' just followed by a '\n' character
682  // check if the string contains a '-' character
683  // if the '-' is the last entry
684  if (str.endsWith(QLatin1Char('-'))) {
685  // validity chek of it + 1
686  if ((it + 1) != textListEnd) {
687  // 1. if the next character is '\n'
688  const QString &lookahedStr = (*(it + 1))->text();
689  if (lookahedStr.startsWith(QLatin1Char('\n'))) {
690  return len - 1;
691  }
692 
693  // 2. if the next word is in a different line or not
694  const NormalizedRect &hyphenArea = (*it)->area;
695  const NormalizedRect &lookaheadArea = (*(it + 1))->area;
696 
697  // lookahead to check whether both the '-' rect and next character rect overlap
698  if (!doesConsumeY(hyphenArea, lookaheadArea, 70)) {
699  return len - 1;
700  }
701  }
702  }
703  // else if it is the second last entry - for example in pdf format
704  else if (str.endsWith(QLatin1String("-\n"))) {
705  return len - 2;
706  }
707 
708  return len;
709 }
710 
711 RegularAreaRect *TextPagePrivate::searchPointToArea(const SearchPoint *sp)
712 {
713  PagePrivate *pagePrivate = PagePrivate::get(m_page);
714  const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
715  RegularAreaRect *ret = new RegularAreaRect;
716 
717  for (TextList::ConstIterator it = sp->it_begin;; it++) {
718  const TinyTextEntity *curEntity = *it;
719  ret->append(curEntity->transformedArea(matrix));
720 
721  if (it == sp->it_end) {
722  break;
723  }
724  }
725 
726  ret->simplify();
727  return ret;
728 }
729 
730 RegularAreaRect *TextPagePrivate::findTextInternalForward(int searchID, const QString &_query, TextComparisonFunction comparer, const TextList::ConstIterator &start, int start_offset, const TextList::ConstIterator &end)
731 {
732  // normalize query search all unicode (including glyphs)
734 
735  // j is the current position in our query
736  // queryLeft is the length of the query we have left to match
737  int j = 0, queryLeft = query.length();
738 
739  TextList::ConstIterator it = start;
740  int offset = start_offset;
741 
742  TextList::ConstIterator it_begin = TextList::ConstIterator();
743  int offset_begin = 0; // dummy initial value to suppress compiler warnings
744 
745  while (it != end) {
746  const TinyTextEntity *curEntity = *it;
747  const QString &str = curEntity->text();
748  const int strLen = str.length();
749  const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
750  // adjustedLen <= strLen
751 
752  if (offset >= strLen) {
753  it++;
754  offset = 0;
755  continue;
756  }
757 
758  if (it_begin == TextList::ConstIterator()) {
759  it_begin = it;
760  offset_begin = offset;
761  }
762 
763  // Let the user write the hyphen or not when searching for text
764  int matchedLen = -1;
765  for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
766  // we have equal (or less than) area of the query left as the length of the current
767  // entity
768  const int min = qMin(queryLeft, matchingLen - offset);
769  if (comparer(str.midRef(offset, min), query.midRef(j, min))) {
770  matchedLen = min;
771  break;
772  }
773  }
774 
775  if (matchedLen == -1) {
776  // we have not matched
777  // this means we do not have a complete match
778  // we need to get back to query start
779  // and continue the search from this place
780 #ifdef DEBUG_TEXTPAGE
781  qCDebug(OkularCoreDebug) << "\tnot matched";
782 #endif
783  j = 0;
784  queryLeft = query.length();
785  it = it_begin;
786  offset = offset_begin + 1;
787  it_begin = TextList::ConstIterator();
788  } else {
789  // we have a match
790  // move the current position in the query
791  // to the position after the length of this string
792  // we matched
793  // subtract the length of the current entity from
794  // the left length of the query
795 #ifdef DEBUG_TEXTPAGE
796  qCDebug(OkularCoreDebug) << "\tmatched" << matchedLen;
797 #endif
798  j += matchedLen;
799  queryLeft -= matchedLen;
800 
801  if (queryLeft == 0) {
802  // save or update the search point for the current searchID
803  QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
804  if (sIt == m_searchPoints.end()) {
805  sIt = m_searchPoints.insert(searchID, new SearchPoint);
806  }
807  SearchPoint *sp = *sIt;
808  sp->it_begin = it_begin;
809  sp->it_end = it;
810  sp->offset_begin = offset_begin;
811  sp->offset_end = offset + matchedLen;
812  return searchPointToArea(sp);
813  }
814 
815  it++;
816  offset = 0;
817  }
818  }
819  // end of loop - it means that we've ended the textentities
820 
821  const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
822  if (sIt != m_searchPoints.end()) {
823  SearchPoint *sp = *sIt;
824  m_searchPoints.erase(sIt);
825  delete sp;
826  }
827  return nullptr;
828 }
829 
830 RegularAreaRect *TextPagePrivate::findTextInternalBackward(int searchID, const QString &_query, TextComparisonFunction comparer, const TextList::ConstIterator &start, int start_offset, const TextList::ConstIterator &end)
831 {
832  // normalize query to search all unicode (including glyphs)
834 
835  // j is the current position in our query
836  // len is the length of the string in TextEntity
837  // queryLeft is the length of the query we have left
838  int j = query.length(), queryLeft = query.length();
839 
840  TextList::ConstIterator it = start;
841  int offset = start_offset;
842 
843  TextList::ConstIterator it_begin = TextList::ConstIterator();
844  int offset_begin = 0; // dummy initial value to suppress compiler warnings
845 
846  while (true) {
847  if (offset <= 0) {
848  if (it == end) {
849  break;
850  }
851  it--;
852  }
853 
854  const TinyTextEntity *curEntity = *it;
855  const QString &str = curEntity->text();
856  const int strLen = str.length();
857  const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
858  // adjustedLen <= strLen
859 
860  if (offset <= 0) {
861  offset = strLen;
862  }
863 
864  if (it_begin == TextList::ConstIterator()) {
865  it_begin = it;
866  offset_begin = offset;
867  }
868 
869  // Let the user write the hyphen or not when searching for text
870  int matchedLen = -1;
871  // we have equal (or less than) area of the query left as the length of the current
872  // entity
873  for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
874  const int hyphenOffset = (strLen - matchingLen);
875  const int min = qMin(queryLeft + hyphenOffset, offset);
876  if (comparer(str.midRef(offset - min, min - hyphenOffset), query.midRef(j - min + hyphenOffset, min - hyphenOffset))) {
877  matchedLen = min - hyphenOffset;
878  break;
879  }
880  }
881 
882  if (matchedLen == -1) {
883  // we have not matched
884  // this means we do not have a complete match
885  // we need to get back to query start
886  // and continue the search from this place
887 #ifdef DEBUG_TEXTPAGE
888  qCDebug(OkularCoreDebug) << "\tnot matched";
889 #endif
890 
891  j = query.length();
892  queryLeft = query.length();
893  it = it_begin;
894  offset = offset_begin - 1;
895  it_begin = TextList::ConstIterator();
896  } else {
897  // we have a match
898  // move the current position in the query
899  // to the position after the length of this string
900  // we matched
901  // subtract the length of the current entity from
902  // the left length of the query
903 #ifdef DEBUG_TEXTPAGE
904  qCDebug(OkularCoreDebug) << "\tmatched";
905 #endif
906  j -= matchedLen;
907  queryLeft -= matchedLen;
908 
909  if (queryLeft == 0) {
910  // save or update the search point for the current searchID
911  QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
912  if (sIt == m_searchPoints.end()) {
913  sIt = m_searchPoints.insert(searchID, new SearchPoint);
914  }
915  SearchPoint *sp = *sIt;
916  sp->it_begin = it;
917  sp->it_end = it_begin;
918  sp->offset_begin = offset - matchedLen;
919  sp->offset_end = offset_begin;
920  return searchPointToArea(sp);
921  }
922 
923  offset = 0;
924  }
925  }
926  // end of loop - it means that we've ended the textentities
927 
928  const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
929  if (sIt != m_searchPoints.end()) {
930  SearchPoint *sp = *sIt;
931  m_searchPoints.erase(sIt);
932  delete sp;
933  }
934  return nullptr;
935 }
936 
938 {
940 }
941 
943 {
944  if (area && area->isNull()) {
945  return QString();
946  }
947 
948  TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
949  QString ret;
950  if (area) {
951  for (; it != itEnd; ++it) {
953  if (area->intersects((*it)->area)) {
954  ret += (*it)->text();
955  }
956  } else {
957  NormalizedPoint center = (*it)->area.center();
958  if (area->contains(center.x, center.y)) {
959  ret += (*it)->text();
960  }
961  }
962  }
963  } else {
964  for (; it != itEnd; ++it) {
965  ret += (*it)->text();
966  }
967  }
968  return ret;
969 }
970 
971 static bool compareTinyTextEntityX(const WordWithCharacters &first, const WordWithCharacters &second)
972 {
973  QRect firstArea = first.area().roundedGeometry(1000, 1000);
974  QRect secondArea = second.area().roundedGeometry(1000, 1000);
975 
976  return firstArea.left() < secondArea.left();
977 }
978 
979 static bool compareTinyTextEntityY(const WordWithCharacters &first, const WordWithCharacters &second)
980 {
981  const QRect firstArea = first.area().roundedGeometry(1000, 1000);
982  const QRect secondArea = second.area().roundedGeometry(1000, 1000);
983 
984  return firstArea.top() < secondArea.top();
985 }
986 
987 /**
988  * Sets a new world list. Deleting the contents of the old one
989  */
990 void TextPagePrivate::setWordList(const TextList &list)
991 {
992  qDeleteAll(m_words);
993  m_words = list;
994 }
995 
996 /**
997  * Remove all the spaces in between texts. It will make all the generators
998  * same, whether they save spaces(like pdf) or not(like djvu).
999  */
1000 static void removeSpace(TextList *words)
1001 {
1002  TextList::Iterator it = words->begin();
1003  const QString str(QLatin1Char(' '));
1004 
1005  while (it != words->end()) {
1006  if ((*it)->text() == str) {
1007  it = words->erase(it);
1008  } else {
1009  ++it;
1010  }
1011  }
1012 }
1013 
1014 /**
1015  * We will read the TinyTextEntity from characters and try to create words from there.
1016  * Note: characters might be already characters for some generators, but we will keep
1017  * the nomenclature characters for the generator produced data. The resulting
1018  * WordsWithCharacters memory has to be managed by the caller, both the
1019  * WordWithCharacters::word and WordWithCharacters::characters contents
1020  */
1021 static WordsWithCharacters makeWordFromCharacters(const TextList &characters, int pageWidth, int pageHeight)
1022 {
1023  /**
1024  * We will traverse characters and try to create words from the TinyTextEntities in it.
1025  * We will search TinyTextEntity blocks and merge them until we get a
1026  * space between two consecutive TinyTextEntities. When we get a space
1027  * we can take it as a end of word. Then we store the word as a TinyTextEntity
1028  * and keep it in newList.
1029 
1030  * We create a RegionText named regionWord that contains the word and the characters associated with it and
1031  * a rectangle area of the element in newList.
1032 
1033  */
1034  WordsWithCharacters wordsWithCharacters;
1035 
1036  TextList::ConstIterator it = characters.begin(), itEnd = characters.end(), tmpIt;
1037  int newLeft, newRight, newTop, newBottom;
1038  int index = 0;
1039 
1040  for (; it != itEnd; it++) {
1041  QString textString = (*it)->text();
1042  QString newString;
1043  QRect lineArea = (*it)->area.roundedGeometry(pageWidth, pageHeight), elementArea;
1044  TextList wordCharacters;
1045  tmpIt = it;
1046  int space = 0;
1047 
1048  while (!space) {
1049  if (!textString.isEmpty()) {
1050  newString.append(textString);
1051 
1052  // when textString is the start of the word
1053  if (tmpIt == it) {
1054  NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1055  wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1056  } else {
1057  NormalizedRect newRect(elementArea, pageWidth, pageHeight);
1058  wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1059  }
1060  }
1061 
1062  ++it;
1063 
1064  /*
1065  we must have to put this line before the if condition of it==itEnd
1066  otherwise the last character can be missed
1067  */
1068  if (it == itEnd) {
1069  break;
1070  }
1071  elementArea = (*it)->area.roundedGeometry(pageWidth, pageHeight);
1072  if (!doesConsumeY(elementArea, lineArea, 60)) {
1073  --it;
1074  break;
1075  }
1076 
1077  const int text_y1 = elementArea.top(), text_x1 = elementArea.left(), text_y2 = elementArea.y() + elementArea.height(), text_x2 = elementArea.x() + elementArea.width();
1078  const int line_y1 = lineArea.top(), line_x1 = lineArea.left(), line_y2 = lineArea.y() + lineArea.height(), line_x2 = lineArea.x() + lineArea.width();
1079 
1080  space = elementArea.left() - lineArea.right();
1081 
1082  if (space != 0) {
1083  it--;
1084  break;
1085  }
1086 
1087  newLeft = text_x1 < line_x1 ? text_x1 : line_x1;
1088  newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1089  newTop = text_y1 > line_y1 ? line_y1 : text_y1;
1090  newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1091 
1092  lineArea.setLeft(newLeft);
1093  lineArea.setTop(newTop);
1094  lineArea.setWidth(newRight - newLeft);
1095  lineArea.setHeight(newBottom - newTop);
1096 
1097  textString = (*it)->text();
1098  }
1099 
1100  // if newString is not empty, save it
1101  if (!newString.isEmpty()) {
1102  const NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1103  TinyTextEntity *word = new TinyTextEntity(newString.normalized(QString::NormalizationForm_KC), newRect);
1104  wordsWithCharacters.append(WordWithCharacters(word, wordCharacters));
1105 
1106  index++;
1107  }
1108 
1109  if (it == itEnd) {
1110  break;
1111  }
1112  }
1113 
1114  return wordsWithCharacters;
1115 }
1116 
1117 /**
1118  * Create Lines from the words and sort them
1119  */
1120 QList<QPair<WordsWithCharacters, QRect>> makeAndSortLines(const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight)
1121 {
1122  /**
1123  * We cannot assume that the generator will give us texts in the right order.
1124  * We can only assume that we will get texts in the page and their bounding
1125  * rectangle. The texts can be character, word, half-word anything.
1126  * So, we need to:
1127  **
1128  * 1. Sort rectangles/boxes containing texts by y0(top)
1129  * 2. Create textline where there is y overlap between TinyTextEntity 's
1130  * 3. Within each line sort the TinyTextEntity 's by x0(left)
1131  */
1132 
1134 
1135  /*
1136  Make a new copy of the TextList in the words, so that the wordsTmp and lines do
1137  not contain same pointers for all the TinyTextEntity.
1138  */
1139  QList<WordWithCharacters> words = wordsTmp;
1140 
1141  // Step 1
1142  std::sort(words.begin(), words.end(), compareTinyTextEntityY);
1143 
1144  // Step 2
1145  QList<WordWithCharacters>::Iterator it = words.begin(), itEnd = words.end();
1146 
1147  // for every non-space texts(characters/words) in the textList
1148  for (; it != itEnd; it++) {
1149  const QRect elementArea = (*it).area().roundedGeometry(pageWidth, pageHeight);
1150  bool found = false;
1151 
1152  for (QPair<WordsWithCharacters, QRect> &linesI : lines) {
1153  /* the line area which will be expanded
1154  line_rects is only necessary to preserve the topmin and bottommax of all
1155  the texts in the line, left and right is not necessary at all
1156  */
1157  QRect &lineArea = linesI.second;
1158  const int text_y1 = elementArea.top(), text_y2 = elementArea.top() + elementArea.height(), text_x1 = elementArea.left(), text_x2 = elementArea.left() + elementArea.width();
1159  const int line_y1 = lineArea.top(), line_y2 = lineArea.top() + lineArea.height(), line_x1 = lineArea.left(), line_x2 = lineArea.left() + lineArea.width();
1160 
1161  /*
1162  if the new text and the line has y overlapping parts of more than 70%,
1163  the text will be added to this line
1164  */
1165  if (doesConsumeY(elementArea, lineArea, 70)) {
1166  WordsWithCharacters &line = linesI.first;
1167  line.append(*it);
1168 
1169  const int newLeft = line_x1 < text_x1 ? line_x1 : text_x1;
1170  const int newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1171  const int newTop = line_y1 < text_y1 ? line_y1 : text_y1;
1172  const int newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1173 
1174  lineArea = QRect(newLeft, newTop, newRight - newLeft, newBottom - newTop);
1175  found = true;
1176  }
1177 
1178  if (found) {
1179  break;
1180  }
1181  }
1182 
1183  /* when we have found a new line create a new TextList containing
1184  only one element and append it to the lines
1185  */
1186  if (!found) {
1187  WordsWithCharacters tmp;
1188  tmp.append((*it));
1189  lines.append(QPair<WordsWithCharacters, QRect>(tmp, elementArea));
1190  }
1191  }
1192 
1193  // Step 3
1194  for (QPair<WordsWithCharacters, QRect> &line : lines) {
1195  WordsWithCharacters &list = line.first;
1196  std::sort(list.begin(), list.end(), compareTinyTextEntityX);
1197  }
1198 
1199  return lines;
1200 }
1201 
1202 /**
1203  * Calculate Statistical information from the lines we made previously
1204  */
1205 static void calculateStatisticalInformation(const QList<WordWithCharacters> &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing)
1206 {
1207  /**
1208  * For the region, defined by line_rects and lines
1209  * 1. Make line statistical analysis to find the line spacing
1210  * 2. Make character statistical analysis to differentiate between
1211  * word spacing and column spacing.
1212  */
1213 
1214  /**
1215  * Step 0
1216  */
1217  const QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(words, pageWidth, pageHeight);
1218 
1219  /**
1220  * Step 1
1221  */
1222  QMap<int, int> line_space_stat;
1223  for (int i = 0; i < sortedLines.length(); i++) {
1224  const QRect rectUpper = sortedLines.at(i).second;
1225 
1226  if (i + 1 == sortedLines.length()) {
1227  break;
1228  }
1229  const QRect rectLower = sortedLines.at(i + 1).second;
1230 
1231  int linespace = rectLower.top() - (rectUpper.top() + rectUpper.height());
1232  if (linespace < 0) {
1233  linespace = -linespace;
1234  }
1235 
1236  if (line_space_stat.contains(linespace)) {
1237  line_space_stat[linespace]++;
1238  } else {
1239  line_space_stat[linespace] = 1;
1240  }
1241  }
1242 
1243  *line_spacing = 0;
1244  int weighted_count = 0;
1245  QMapIterator<int, int> iterate_linespace(line_space_stat);
1246 
1247  while (iterate_linespace.hasNext()) {
1248  iterate_linespace.next();
1249  *line_spacing += iterate_linespace.value() * iterate_linespace.key();
1250  weighted_count += iterate_linespace.value();
1251  }
1252  if (*line_spacing != 0) {
1253  *line_spacing = (int)((double)*line_spacing / (double)weighted_count + 0.5);
1254  }
1255 
1256  /**
1257  * Step 2
1258  */
1259  // We would like to use QMap instead of QHash as it will keep the keys sorted
1260  QMap<int, int> hor_space_stat;
1261  QMap<int, int> col_space_stat;
1262  QList<QList<QRect>> space_rects;
1263  QVector<QRect> max_hor_space_rects;
1264 
1265  // Space in every line
1266  for (const QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1267  const WordsWithCharacters list = sortedLine.first;
1268  QList<QRect> line_space_rects;
1269  int maxSpace = 0, minSpace = pageWidth;
1270 
1271  // for every TinyTextEntity element in the line
1273  QRect max_area1, max_area2;
1274  // for every line
1275  for (; it != itEnd; it++) {
1276  const QRect area1 = (*it).area().roundedGeometry(pageWidth, pageHeight);
1277  if (it + 1 == itEnd) {
1278  break;
1279  }
1280 
1281  const QRect area2 = (*(it + 1)).area().roundedGeometry(pageWidth, pageHeight);
1282  int space = area2.left() - area1.right();
1283 
1284  if (space > maxSpace) {
1285  max_area1 = area1;
1286  max_area2 = area2;
1287  maxSpace = space;
1288  }
1289 
1290  if (space < minSpace && space != 0) {
1291  minSpace = space;
1292  }
1293 
1294  // if we found a real space, whose length is not zero and also less than the pageWidth
1295  if (space != 0 && space != pageWidth) {
1296  // increase the count of the space amount
1297  if (hor_space_stat.contains(space)) {
1298  hor_space_stat[space]++;
1299  } else {
1300  hor_space_stat[space] = 1;
1301  }
1302 
1303  int left, right, top, bottom;
1304 
1305  left = area1.right();
1306  right = area2.left();
1307 
1308  top = area2.top() < area1.top() ? area2.top() : area1.top();
1309  bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1310 
1311  QRect rect(left, top, right - left, bottom - top);
1312  line_space_rects.append(rect);
1313  }
1314  }
1315 
1316  space_rects.append(line_space_rects);
1317 
1318  if (hor_space_stat.contains(maxSpace)) {
1319  if (hor_space_stat[maxSpace] != 1) {
1320  hor_space_stat[maxSpace]--;
1321  } else {
1322  hor_space_stat.remove(maxSpace);
1323  }
1324  }
1325 
1326  if (maxSpace != 0) {
1327  if (col_space_stat.contains(maxSpace)) {
1328  col_space_stat[maxSpace]++;
1329  } else {
1330  col_space_stat[maxSpace] = 1;
1331  }
1332 
1333  // store the max rect of each line
1334  const int left = max_area1.right();
1335  const int right = max_area2.left();
1336  const int top = (max_area1.top() > max_area2.top()) ? max_area2.top() : max_area1.top();
1337  const int bottom = (max_area1.bottom() < max_area2.bottom()) ? max_area2.bottom() : max_area1.bottom();
1338 
1339  const QRect rect(left, top, right - left, bottom - top);
1340  max_hor_space_rects.append(rect);
1341  } else {
1342  max_hor_space_rects.append(QRect(0, 0, 0, 0));
1343  }
1344  }
1345 
1346  // All the between word space counts are in hor_space_stat
1347  *word_spacing = 0;
1348  weighted_count = 0;
1349  QMapIterator<int, int> iterate(hor_space_stat);
1350 
1351  while (iterate.hasNext()) {
1352  iterate.next();
1353 
1354  if (iterate.key() > 0) {
1355  *word_spacing += iterate.value() * iterate.key();
1356  weighted_count += iterate.value();
1357  }
1358  }
1359  if (weighted_count) {
1360  *word_spacing = (int)((double)*word_spacing / (double)weighted_count + 0.5);
1361  }
1362 
1363  *col_spacing = 0;
1364  QMapIterator<int, int> iterate_col(col_space_stat);
1365 
1366  while (iterate_col.hasNext()) {
1367  iterate_col.next();
1368  if (iterate_col.value() > *col_spacing) {
1369  *col_spacing = iterate_col.value();
1370  }
1371  }
1372  *col_spacing = col_space_stat.key(*col_spacing);
1373 
1374  // if there is just one line in a region, there is no point in dividing it
1375  if (sortedLines.length() == 1) {
1376  *word_spacing = *col_spacing;
1377  }
1378 }
1379 
1380 /**
1381  * Implements the XY Cut algorithm for textpage segmentation
1382  * The resulting RegionTextList will contain RegionText whose WordsWithCharacters::word and
1383  * WordsWithCharacters::characters are reused from wordsWithCharacters (i.e. no new nor delete happens in this function)
1384  */
1385 static RegionTextList XYCutForBoundingBoxes(const QList<WordWithCharacters> &wordsWithCharacters, int pageWidth, int pageHeight)
1386 {
1387  RegionTextList tree;
1388  QRect contentRect(0, 0, pageWidth, pageHeight);
1389  const RegionText root(wordsWithCharacters, contentRect);
1390 
1391  // start the tree with the root, it is our only region at the start
1392  tree.push_back(root);
1393 
1394  int i = 0;
1395 
1396  // while traversing the tree has not been ended
1397  while (i < tree.length()) {
1398  const RegionText node = tree.at(i);
1399  QRect regionRect = node.area();
1400 
1401  /**
1402  * 1. calculation of projection profiles
1403  */
1404  // allocate the size of proj profiles and initialize with 0
1405  int size_proj_y = node.area().height();
1406  int size_proj_x = node.area().width();
1407  // dynamic memory allocation
1408  QVarLengthArray<int> proj_on_xaxis(size_proj_x);
1409  QVarLengthArray<int> proj_on_yaxis(size_proj_y);
1410 
1411  for (int j = 0; j < size_proj_y; ++j) {
1412  proj_on_yaxis[j] = 0;
1413  }
1414  for (int j = 0; j < size_proj_x; ++j) {
1415  proj_on_xaxis[j] = 0;
1416  }
1417 
1418  const QList<WordWithCharacters> list = node.text();
1419 
1420  // Calculate tcx and tcy locally for each new region
1421  int word_spacing, line_spacing, column_spacing;
1422  calculateStatisticalInformation(list, pageWidth, pageHeight, &word_spacing, &line_spacing, &column_spacing);
1423 
1424  const int tcx = word_spacing * 2;
1425  const int tcy = line_spacing * 2;
1426 
1427  int maxX = 0, maxY = 0;
1428  int avgX = 0;
1429  int count;
1430 
1431  // for every text in the region
1432  for (const WordWithCharacters &wwc : list) {
1433  TinyTextEntity *ent = wwc.word;
1434  const QRect entRect = ent->area.geometry(pageWidth, pageHeight);
1435 
1436  // calculate vertical projection profile proj_on_xaxis1
1437  for (int k = entRect.left(); k <= entRect.left() + entRect.width(); ++k) {
1438  if ((k - regionRect.left()) < size_proj_x && (k - regionRect.left()) >= 0) {
1439  proj_on_xaxis[k - regionRect.left()] += entRect.height();
1440  }
1441  }
1442 
1443  // calculate horizontal projection profile in the same way
1444  for (int k = entRect.top(); k <= entRect.top() + entRect.height(); ++k) {
1445  if ((k - regionRect.top()) < size_proj_y && (k - regionRect.top()) >= 0) {
1446  proj_on_yaxis[k - regionRect.top()] += entRect.width();
1447  }
1448  }
1449  }
1450 
1451  for (int j = 0; j < size_proj_y; ++j) {
1452  if (proj_on_yaxis[j] > maxY) {
1453  maxY = proj_on_yaxis[j];
1454  }
1455  }
1456 
1457  avgX = count = 0;
1458  for (int j = 0; j < size_proj_x; ++j) {
1459  if (proj_on_xaxis[j] > maxX) {
1460  maxX = proj_on_xaxis[j];
1461  }
1462  if (proj_on_xaxis[j]) {
1463  count++;
1464  avgX += proj_on_xaxis[j];
1465  }
1466  }
1467  if (count) {
1468  avgX /= count;
1469  }
1470 
1471  /**
1472  * 2. Cleanup Boundary White Spaces and removal of noise
1473  */
1474  int xbegin = 0, xend = size_proj_x - 1;
1475  int ybegin = 0, yend = size_proj_y - 1;
1476  while (xbegin < size_proj_x && proj_on_xaxis[xbegin] <= 0) {
1477  xbegin++;
1478  }
1479  while (xend >= 0 && proj_on_xaxis[xend] <= 0) {
1480  xend--;
1481  }
1482  while (ybegin < size_proj_y && proj_on_yaxis[ybegin] <= 0) {
1483  ybegin++;
1484  }
1485  while (yend >= 0 && proj_on_yaxis[yend] <= 0) {
1486  yend--;
1487  }
1488 
1489  // update the regionRect
1490  int old_left = regionRect.left(), old_top = regionRect.top();
1491  regionRect.setLeft(old_left + xbegin);
1492  regionRect.setRight(old_left + xend);
1493  regionRect.setTop(old_top + ybegin);
1494  regionRect.setBottom(old_top + yend);
1495 
1496  int tnx = (int)((double)avgX * 10.0 / 100.0 + 0.5), tny = 0;
1497  for (int j = 0; j < size_proj_x; ++j) {
1498  proj_on_xaxis[j] -= tnx;
1499  }
1500  for (int j = 0; j < size_proj_y; ++j) {
1501  proj_on_yaxis[j] -= tny;
1502  }
1503 
1504  /**
1505  * 3. Find the Widest gap
1506  */
1507  int gap_hor = -1, pos_hor = -1;
1508  int begin = -1, end = -1;
1509 
1510  // find all hor_gaps and find the maximum between them
1511  for (int j = 1; j < size_proj_y; ++j) {
1512  // transition from white to black
1513  if (begin >= 0 && proj_on_yaxis[j - 1] <= 0 && proj_on_yaxis[j] > 0) {
1514  end = j;
1515  }
1516 
1517  // transition from black to white
1518  if (proj_on_yaxis[j - 1] > 0 && proj_on_yaxis[j] <= 0) {
1519  begin = j;
1520  }
1521 
1522  if (begin > 0 && end > 0 && end - begin > gap_hor) {
1523  gap_hor = end - begin;
1524  pos_hor = (end + begin) / 2;
1525  begin = -1;
1526  end = -1;
1527  }
1528  }
1529 
1530  begin = -1, end = -1;
1531  int gap_ver = -1, pos_ver = -1;
1532 
1533  // find all the ver_gaps and find the maximum between them
1534  for (int j = 1; j < size_proj_x; ++j) {
1535  // transition from white to black
1536  if (begin >= 0 && proj_on_xaxis[j - 1] <= 0 && proj_on_xaxis[j] > 0) {
1537  end = j;
1538  }
1539 
1540  // transition from black to white
1541  if (proj_on_xaxis[j - 1] > 0 && proj_on_xaxis[j] <= 0) {
1542  begin = j;
1543  }
1544 
1545  if (begin > 0 && end > 0 && end - begin > gap_ver) {
1546  gap_ver = end - begin;
1547  pos_ver = (end + begin) / 2;
1548  begin = -1;
1549  end = -1;
1550  }
1551  }
1552 
1553  int cut_pos_x = pos_ver, cut_pos_y = pos_hor;
1554  int gap_x = gap_ver, gap_y = gap_hor;
1555 
1556  /**
1557  * 4. Cut the region and make nodes (left,right) or (up,down)
1558  */
1559  bool cut_hor = false, cut_ver = false;
1560 
1561  // For horizontal cut
1562  const int topHeight = cut_pos_y - (regionRect.top() - old_top);
1563  const QRect topRect(regionRect.left(), regionRect.top(), regionRect.width(), topHeight);
1564  const QRect bottomRect(regionRect.left(), regionRect.top() + topHeight, regionRect.width(), regionRect.height() - topHeight);
1565 
1566  // For vertical Cut
1567  const int leftWidth = cut_pos_x - (regionRect.left() - old_left);
1568  const QRect leftRect(regionRect.left(), regionRect.top(), leftWidth, regionRect.height());
1569  const QRect rightRect(regionRect.left() + leftWidth, regionRect.top(), regionRect.width() - leftWidth, regionRect.height());
1570 
1571  if (gap_y >= gap_x && gap_y >= tcy) {
1572  cut_hor = true;
1573  } else if (gap_y >= gap_x && gap_y <= tcy && gap_x >= tcx) {
1574  cut_ver = true;
1575  } else if (gap_x >= gap_y && gap_x >= tcx) {
1576  cut_ver = true;
1577  } else if (gap_x >= gap_y && gap_x <= tcx && gap_y >= tcy) {
1578  cut_hor = true;
1579  } else {
1580  // no cut possible
1581  // we can now update the node rectangle with the shrinked rectangle
1582  RegionText tmpNode = tree.at(i);
1583  tmpNode.setArea(regionRect);
1584  tree.replace(i, tmpNode);
1585  i++;
1586  continue;
1587  }
1588 
1589  WordsWithCharacters list1, list2;
1590 
1591  // horizontal cut, topRect and bottomRect
1592  if (cut_hor) {
1593  for (const WordWithCharacters &word : list) {
1594  const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1595 
1596  if (topRect.intersects(wordRect)) {
1597  list1.append(word);
1598  } else {
1599  list2.append(word);
1600  }
1601  }
1602 
1603  RegionText node1(list1, topRect);
1604  RegionText node2(list2, bottomRect);
1605 
1606  tree.replace(i, node1);
1607  tree.insert(i + 1, node2);
1608  }
1609 
1610  // vertical cut, leftRect and rightRect
1611  else if (cut_ver) {
1612  for (const WordWithCharacters &word : list) {
1613  const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1614 
1615  if (leftRect.intersects(wordRect)) {
1616  list1.append(word);
1617  } else {
1618  list2.append(word);
1619  }
1620  }
1621 
1622  RegionText node1(list1, leftRect);
1623  RegionText node2(list2, rightRect);
1624 
1625  tree.replace(i, node1);
1626  tree.insert(i + 1, node2);
1627  }
1628  }
1629 
1630  return tree;
1631 }
1632 
1633 /**
1634  * Add spaces in between words in a line. It reuses the pointers passed in tree and might add new ones. You will need to take care of deleting them if needed
1635  */
1636 WordsWithCharacters addNecessarySpace(RegionTextList tree, int pageWidth, int pageHeight)
1637 {
1638  /**
1639  * 1. Call makeAndSortLines before adding spaces in between words in a line
1640  * 2. Now add spaces between every two words in a line
1641  * 3. Finally, extract all the space separated texts from each region and return it
1642  */
1643 
1644  // Only change the texts under RegionTexts, not the area
1645  for (RegionText &tmpRegion : tree) {
1646  // Step 01
1647  QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(tmpRegion.text(), pageWidth, pageHeight);
1648 
1649  // Step 02
1650  for (QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1651  WordsWithCharacters &list = sortedLine.first;
1652  for (int k = 0; k < list.length(); k++) {
1653  const QRect area1 = list.at(k).area().roundedGeometry(pageWidth, pageHeight);
1654  if (k + 1 >= list.length()) {
1655  break;
1656  }
1657 
1658  const QRect area2 = list.at(k + 1).area().roundedGeometry(pageWidth, pageHeight);
1659  const int space = area2.left() - area1.right();
1660 
1661  if (space != 0) {
1662  // Make a TinyTextEntity of string space and push it between it and it+1
1663  const int left = area1.right();
1664  const int right = area2.left();
1665  const int top = area2.top() < area1.top() ? area2.top() : area1.top();
1666  const int bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1667 
1668  const QString spaceStr(QStringLiteral(" "));
1669  const QRect rect(QPoint(left, top), QPoint(right, bottom));
1670  const NormalizedRect entRect(rect, pageWidth, pageHeight);
1671  TinyTextEntity *ent1 = new TinyTextEntity(spaceStr, entRect);
1672  TinyTextEntity *ent2 = new TinyTextEntity(spaceStr, entRect);
1673  WordWithCharacters word(ent1, QList<TinyTextEntity *>() << ent2);
1674 
1675  list.insert(k + 1, word);
1676 
1677  // Skip the space
1678  k++;
1679  }
1680  }
1681  }
1682 
1683  WordsWithCharacters tmpList;
1684  for (const QPair<WordsWithCharacters, QRect> &sortedLine : qAsConst(sortedLines)) {
1685  tmpList += sortedLine.first;
1686  }
1687  tmpRegion.setText(tmpList);
1688  }
1689 
1690  // Step 03
1691  WordsWithCharacters tmp;
1692  for (const RegionText &tmpRegion : qAsConst(tree)) {
1693  tmp += tmpRegion.text();
1694  }
1695  return tmp;
1696 }
1697 
1698 /**
1699  * Correct the textOrder, all layout recognition works here
1700  */
1701 void TextPagePrivate::correctTextOrder()
1702 {
1703  // m_page->width() and m_page->height() are in pixels at
1704  // 100% zoom level, and thus depend on display DPI.
1705  // To avoid Okular failing on lowDPI displays,
1706  // we scale pageWidth and pageHeight so their sum equals 2000.
1707  const double scalingFactor = 2000.0 / (m_page->width() + m_page->height());
1708  const int pageWidth = (int)(scalingFactor * m_page->width());
1709  const int pageHeight = (int)(scalingFactor * m_page->height());
1710 
1711  TextList characters = m_words;
1712 
1713  /**
1714  * Remove spaces from the text
1715  */
1716  removeSpace(&characters);
1717 
1718  /**
1719  * Construct words from characters
1720  */
1721  const QList<WordWithCharacters> wordsWithCharacters = makeWordFromCharacters(characters, pageWidth, pageHeight);
1722 
1723  /**
1724  * Make a XY Cut tree for segmentation of the texts
1725  */
1726  const RegionTextList tree = XYCutForBoundingBoxes(wordsWithCharacters, pageWidth, pageHeight);
1727 
1728  /**
1729  * Add spaces to the word
1730  */
1731  const WordsWithCharacters listWithWordsAndSpaces = addNecessarySpace(tree, pageWidth, pageHeight);
1732 
1733  /**
1734  * Break the words into characters
1735  */
1736  TextList listOfCharacters;
1737  for (const WordWithCharacters &word : listWithWordsAndSpaces) {
1738  delete word.word;
1739  listOfCharacters.append(word.characters);
1740  }
1741  setWordList(listOfCharacters);
1742 }
1743 
1745 {
1746  if (area && area->isNull()) {
1747  return TextEntity::List();
1748  }
1749 
1750  TextEntity::List ret;
1751  if (area) {
1752  for (const TinyTextEntity *te : qAsConst(d->m_words)) {
1754  if (area->intersects(te->area)) {
1755  ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1756  }
1757  } else {
1758  const NormalizedPoint center = te->area.center();
1759  if (area->contains(center.x, center.y)) {
1760  ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1761  }
1762  }
1763  }
1764  } else {
1765  for (const TinyTextEntity *te : qAsConst(d->m_words)) {
1766  ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1767  }
1768  }
1769  return ret;
1770 }
1771 
1773 {
1774  TextList::ConstIterator itBegin = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
1775  TextList::ConstIterator it = itBegin;
1776  TextList::ConstIterator posIt = itEnd;
1777  for (; it != itEnd; ++it) {
1778  if ((*it)->area.contains(p.x, p.y)) {
1779  posIt = it;
1780  break;
1781  }
1782  }
1783  if (posIt != itEnd) {
1784  if ((*posIt)->text().simplified().isEmpty()) {
1785  return nullptr;
1786  }
1787  // Find the first TinyTextEntity of the word
1788  while (posIt != itBegin) {
1789  --posIt;
1790  const QString itText = (*posIt)->text();
1791  if (itText.right(1).at(0).isSpace()) {
1792  if (itText.endsWith(QLatin1String("-\n"))) {
1793  // Is an hyphenated word
1794  // continue searching the start of the word back
1795  continue;
1796  }
1797 
1798  if (itText == QLatin1String("\n") && posIt != itBegin) {
1799  --posIt;
1800  if ((*posIt)->text().endsWith(QLatin1String("-"))) {
1801  // Is an hyphenated word
1802  // continue searching the start of the word back
1803  continue;
1804  }
1805  ++posIt;
1806  }
1807 
1808  ++posIt;
1809  break;
1810  }
1811  }
1812  RegularAreaRect *ret = new RegularAreaRect();
1813  QString foundWord;
1814  for (; posIt != itEnd; ++posIt) {
1815  const QString itText = (*posIt)->text();
1816  if (itText.simplified().isEmpty()) {
1817  break;
1818  }
1819 
1820  ret->appendShape((*posIt)->area);
1821  foundWord += (*posIt)->text();
1822  if (itText.right(1).at(0).isSpace()) {
1823  if (!foundWord.endsWith(QLatin1String("-\n"))) {
1824  break;
1825  }
1826  }
1827  }
1828 
1829  if (word) {
1830  *word = foundWord;
1831  }
1832  return ret;
1833  } else {
1834  return nullptr;
1835  }
1836 }
void append(const T &value)
T & first()
QTextStream & center(QTextStream &s)
NormalizationForm_KC
QTextStream & right(QTextStream &stream)
bool endsWith(const QString &s, Qt::CaseSensitivity cs) const const
bool contains(double x, double y) const
Returns whether this area contains the normalized point (x, y).
Definition: area.h:866
void setWidth(int width)
bool contains(const Key &key) const const
NormalizedPoint start() const
Returns the start point of the selection.
Definition: misc.cpp:70
double y
The normalized y coordinate.
Definition: area.h:171
const QChar * constData() const const
NormalizedRect transformedArea(const QTransform &matrix) const
Returns the transformed area of the text entity.
Definition: textpage.cpp:205
CaseInsensitive
The documentation to the global Okular namespace.
Definition: action.h:16
int right() const const
QStringRef midRef(int position, int n) const const
QTextStream & left(QTextStream &stream)
SearchDirection
Describes the direction of searching.
Definition: global.h:35
int length() const const
void append(const T &value)
NormalizedPoint is a helper class which stores the coordinates of a normalized point.
Definition: area.h:116
int width() const const
int x() const const
int y() const const
QList::const_iterator constBegin() const const
int x() const const
int y() const const
bool isRight(const NormalizedPoint &pt) const
Returns true if the point pt is located to the left of the right edge of the rectangle.
Definition: area.h:376
KIOFILEWIDGETS_EXPORT QStringList list(const QString &fileClass)
int compare(const QString &other, Qt::CaseSensitivity cs) const const
const QList< QKeySequence > & begin()
~TextEntity()
Destroys the text entity.
Definition: textpage.cpp:190
bool isBottom(const NormalizedPoint &pt) const
Returns true if the point pt is located below the bottom of the rectangle.
Definition: area.h:331
bool intersects(const RegularArea< NormalizedShape, Shape > *area) const
Returns whether this area intersects with the given area.
Definition: area.h:783
QString simplified() const const
int left() const const
TextPage()
Creates a new text page.
Definition: textpage.cpp:223
QString normalized(QString::NormalizationForm mode, QChar::UnicodeVersion version) const const
void setRight(int x)
QRect geometry(int xScale, int yScale) const
Returns the rectangle mapped to a reference area of xScale x yScale.
Definition: area.cpp:232
int remove(const Key &key)
KSERVICE_EXPORT KService::List query(FilterFunc filterFunc)
TextAreaInclusionBehaviour
Defines the behaviour of adding characters to text() result.
Definition: textpage.h:117
int bottom() const const
Q_SCRIPTABLE Q_NOREPLY void start()
int top() const const
RegularAreaRect * textArea(TextSelection *selection) const
Returns the rectangular area of the given selection.
Definition: textpage.cpp:340
Wrapper around the information needed to generate the selection area There are two assumptions inside...
Definition: misc.h:33
bool isSpace() const const
This is a list of NormalizedRect, to describe an area consisting of multiple rectangles using normali...
Definition: area.h:932
void setHeight(int height)
QString text() const
Returns the text of the text entity.
Definition: textpage.cpp:195
void setBottom(int y)
MergeSide
The side(s) to be considered when merging areas.
Definition: global.h:63
bool isEmpty() const const
int length() const const
const T & at(int i) const const
bool isEmpty() const const
@ FromBottom
Searching from bottom of the page, next result is to be found, there was no earlier search result.
Definition: global.h:37
bool isBottomOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located below the top of the rectangle.
Definition: area.h:349
QPoint center() const const
A NormalizedRect is a rectangle which can be defined by two NormalizedPoints.
Definition: area.h:188
@ PreviousResult
Searching for the previous result on the page, earlier result should be located so we search from the...
Definition: global.h:39
void append(const QString &text, NormalizedRect *area)
Appends the given text with the given area as new TextEntity to the page.
Definition: textpage.cpp:246
bool isTopOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located above the bottom of the rectangle.
Definition: area.h:358
RegularAreaRect * findText(int searchID, const QString &query, SearchDirection direction, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
Returns the bounding rect of the text which matches the following criteria or 0 if the search is not ...
Definition: textpage.cpp:617
@ MergeRight
Merge only if the right side of the first area intersect.
Definition: global.h:64
const Key key(const T &value, const Key &defaultKey) const const
void insert(int i, const T &value)
@ NextResult
Searching for the next result on the page, earlier result should be located so we search from the las...
Definition: global.h:38
bool startsWith(const QString &s, Qt::CaseSensitivity cs) const const
typedef ConstIterator
void setTop(int y)
double x
The normalized x coordinate.
Definition: area.h:166
int height() const const
QList::const_iterator constEnd() const const
NormalizedRect * area() const
Returns the bounding area of the text entity.
Definition: textpage.cpp:200
TextEntity::List words(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
Text entity extraction function.
Definition: textpage.cpp:1744
void transform(const QTransform &matrix)
Transforms the normalized rectangle with the operations defined by matrix.
Definition: area.cpp:253
QString right(int n) const const
@ FromTop
Searching from top of the page, next result is to be found, there was no earlier search result.
Definition: global.h:36
QString fromRawData(const QChar *unicode, int size)
const QChar at(int position) const const
void end(const NormalizedPoint &point)
Changes the end point of the selection to the given point.
Definition: misc.cpp:43
bool isTop(const NormalizedPoint &pt) const
Returns true if the point pt is located above the top of the rectangle.
Definition: area.h:340
QList::iterator begin()
bool intersects(const NormalizedRect &other) const
Returns whether the normalized rectangle intersects the other normalized rectangle.
Definition: area.cpp:165
bool contains(double x, double y) const
Returns whether the normalized rectangle contains the normalized point (x, y).
Definition: area.cpp:160
bool isNull() const
Returns whether the regular area is a null area.
Definition: area.h:751
void simplify()
Simplifies this regular area by merging its intersecting subareas.
Definition: area.h:730
void appendShape(const NormalizedShape &shape, MergeSide side=MergeAll)
Appends the given shape to this area.
Definition: area.h:810
Represents a piece of text on a TextPage, containing its textual representation and its bounding box.
Definition: textpage.h:51
RegularAreaRect * wordAt(const NormalizedPoint &p, QString *word=nullptr) const
Returns the area and text of the word at the given point Note that ownership of the returned area bel...
Definition: textpage.cpp:1772
QList::iterator end()
TextEntity(const QString &text, NormalizedRect *area)
Creates a new text entity with the given text and the given area.
Definition: textpage.cpp:183
@ AnyPixelTextAreaInclusionBehaviour
A character is included into text() result if any pixel of his bounding box is in the given area.
Definition: textpage.h:118
void setLeft(int x)
bool isLeft(const NormalizedPoint &pt) const
Returns true if the point pt is located to the right of the left edge of the rectangle.
Definition: area.h:367
const QList< QKeySequence > & end()
QString & append(QChar ch)
double top
The normalized top coordinate.
Definition: area.h:425
ushort unicode() const const
QString text(const RegularAreaRect *area=nullptr) const
Text extraction function.
Definition: textpage.cpp:937
double bottom
The normalized bottom coordinate.
Definition: area.h:435
virtual QVariant get(ScriptableExtension *callerPrincipal, quint64 objId, const QString &propName)
~TextPage()
Destroys the text page.
Definition: textpage.cpp:241
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Fri Sep 22 2023 04:09:42 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.