Okular

textpage.cpp
1 /***************************************************************************
2  * Copyright (C) 2005 by Piotr Szymanski <[email protected]> *
3  * *
4  * This program is free software; you can redistribute it and/or modify *
5  * it under the terms of the GNU General Public License as published by *
6  * the Free Software Foundation; either version 2 of the License, or *
7  * (at your option) any later version. *
8  ***************************************************************************/
9 
10 #include "textpage.h"
11 #include "textpage_p.h"
12 
13 #include <QDebug>
14 
15 #include "area.h"
16 #include "debug_p.h"
17 #include "misc.h"
18 #include "page.h"
19 #include "page_p.h"
20 
21 #include <cstring>
22 
23 #include <QVarLengthArray>
24 #include <QtAlgorithms>
25 
26 using namespace Okular;
27 
28 class SearchPoint
29 {
30 public:
31  SearchPoint()
32  : offset_begin(-1)
33  , offset_end(-1)
34  {
35  }
36 
38  TextList::ConstIterator it_begin;
39 
41  TextList::ConstIterator it_end;
42 
46  int offset_begin;
47 
51  int offset_end;
52 };
53 
54 /* text comparison functions */
55 
56 static bool CaseInsensitiveCmpFn(const QStringRef &from, const QStringRef &to)
57 {
58 #ifdef DEBUG_TEXTPAGE
59  qDebug(OkularCoreDebug) << from << ":" << to << "(case insensitive)";
60 #endif
61  return from.compare(to, Qt::CaseInsensitive) == 0;
62 }
63 
64 static bool CaseSensitiveCmpFn(const QStringRef &from, const QStringRef &to)
65 {
66 #ifdef DEBUG_TEXTPAGE
67  qDebug(OkularCoreDebug) << from << ":" << to << "(case sensitive)";
68 #endif
69  return from.compare(to, Qt::CaseSensitive) == 0;
70 }
71 
/**
 * Tells whether two 1-D segments [left1, right1] and [left2, right2] overlap
 * "enough".
 *
 * If either segment fully contains the other the answer is always true.
 * Otherwise the shared length must be at least @p threshold percent of the
 * shorter segment's length.
 */
static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold)
{
    // Full containment in either direction short-circuits the percentage test.
    const bool firstCoversSecond = left1 <= left2 && right1 >= right2;
    if (firstCoversSecond) {
        return true;
    }

    const bool secondCoversFirst = left1 >= left2 && right1 <= right2;
    if (secondCoversFirst) {
        return true;
    }

    // Segments that do not even touch cannot overlap.
    if (!(right2 >= left1 && right1 >= left2)) {
        return false;
    }

    // Partial overlap: the shared part runs from the later start to the earlier end.
    const double shared = (right2 >= right1) ? (right1 - left2) : (right2 - left1);

    const double firstLength = right1 - left1;
    const double secondLength = right2 - left2;
    const double shorter = (firstLength < secondLength) ? firstLength : secondLength;

    return shared * 100 >= threshold * shorter;
}
99 
100 static bool doesConsumeY(const QRect first, const QRect second, int threshold)
101 {
102  return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold);
103 }
104 
105 static bool doesConsumeY(const NormalizedRect &first, const NormalizedRect &second, int threshold)
106 {
107  return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold);
108 }
109 
110 /*
111  Rationale behind TinyTextEntity:
112 
113  instead of storing directly a QString for the text of an entity,
114  we store the UTF-16 data and their length. This way, we save about
115  4 int's wrt a QString, and we can create a new string from that
116  raw data (that's the only penalty of that).
117  Even better, if the string we need to store has at most
118  MaxStaticChars characters, then we store those in place of the QChar*
119 that would be used (with new[] + delete[]) for the data.
120  */
121 class TinyTextEntity
122 {
123  static const int MaxStaticChars = sizeof(void *) / sizeof(QChar);
124 
125 public:
126  TinyTextEntity(const QString &text, const NormalizedRect &rect)
127  : area(rect)
128  {
129  Q_ASSERT_X(!text.isEmpty(), "TinyTextEntity", "empty string");
130  Q_ASSERT_X(sizeof(d) == sizeof(void *), "TinyTextEntity", "internal storage is wider than QChar*, fix it!");
131  length = text.length();
132  switch (length) {
133 #if QT_POINTER_SIZE >= 8
134  case 4:
135  d.qc[3] = text.at(3).unicode();
136  // fall through
137  case 3:
138  d.qc[2] = text.at(2).unicode();
139 #endif
140  // fall through
141  case 2:
142  d.qc[1] = text.at(1).unicode();
143  // fall through
144  case 1:
145  d.qc[0] = text.at(0).unicode();
146  break;
147  default:
148  d.data = new QChar[length];
149  std::memcpy(d.data, text.constData(), length * sizeof(QChar));
150  }
151  }
152 
153  ~TinyTextEntity()
154  {
155  if (length > MaxStaticChars) {
156  delete[] d.data;
157  }
158  }
159 
160  inline QString text() const
161  {
162  return length <= MaxStaticChars ? QString::fromRawData((const QChar *)&d.qc[0], length) : QString::fromRawData(d.data, length);
163  }
164 
165  inline NormalizedRect transformedArea(const QTransform &matrix) const
166  {
167  NormalizedRect transformed_area = area;
168  transformed_area.transform(matrix);
169  return transformed_area;
170  }
171 
172  NormalizedRect area;
173 
174 private:
175  Q_DISABLE_COPY(TinyTextEntity)
176 
177  union {
178  QChar *data;
179  ushort qc[MaxStaticChars];
180  } d;
181  int length;
182 };
183 
185  : m_text(text)
186  , m_area(area)
187  , d(nullptr)
188 {
189 }
190 
192 {
193  delete m_area;
194 }
195 
197 {
198  return m_text;
199 }
200 
202 {
203  return m_area;
204 }
205 
207 {
208  NormalizedRect transformed_area = *m_area;
209  transformed_area.transform(matrix);
210  return transformed_area;
211 }
212 
213 TextPagePrivate::TextPagePrivate()
214  : m_page(nullptr)
215 {
216 }
217 
218 TextPagePrivate::~TextPagePrivate()
219 {
220  qDeleteAll(m_searchPoints);
221  qDeleteAll(m_words);
222 }
223 
225  : d(new TextPagePrivate())
226 {
227 }
228 
230  : d(new TextPagePrivate())
231 {
232  TextEntity::List::ConstIterator it = words.constBegin(), itEnd = words.constEnd();
233  for (; it != itEnd; ++it) {
234  TextEntity *e = *it;
235  if (!e->text().isEmpty())
236  d->m_words.append(new TinyTextEntity(e->text(), *e->area()));
237  delete e;
238  }
239 }
240 
242 {
243  delete d;
244 }
245 
246 void TextPage::append(const QString &text, NormalizedRect *area)
247 {
248  if (!text.isEmpty())
249  d->m_words.append(new TinyTextEntity(text.normalized(QString::NormalizationForm_KC), *area));
250  delete area;
251 }
252 
253 struct WordWithCharacters {
254  WordWithCharacters(TinyTextEntity *w, const TextList &c)
255  : word(w)
256  , characters(c)
257  {
258  }
259 
260  inline QString text() const
261  {
262  return word->text();
263  }
264 
265  inline const NormalizedRect &area() const
266  {
267  return word->area;
268  }
269 
270  TinyTextEntity *word;
271  TextList characters;
272 };
274 
280 class RegionText
281 {
282 public:
283  RegionText() {};
284 
285  RegionText(const WordsWithCharacters &wordsWithCharacters, const QRect area)
286  : m_region_wordWithCharacters(wordsWithCharacters)
287  , m_area(area)
288  {
289  }
290 
291  inline QString string() const
292  {
293  QString res;
294  for (const WordWithCharacters &word : m_region_wordWithCharacters) {
295  res += word.text();
296  }
297  return res;
298  }
299 
300  inline WordsWithCharacters text() const
301  {
302  return m_region_wordWithCharacters;
303  }
304 
305  inline QRect area() const
306  {
307  return m_area;
308  }
309 
310  inline void setArea(const QRect area)
311  {
312  m_area = area;
313  }
314 
315  inline void setText(const WordsWithCharacters &wordsWithCharacters)
316  {
317  m_region_wordWithCharacters = wordsWithCharacters;
318  }
319 
320 private:
321  WordsWithCharacters m_region_wordWithCharacters;
322  QRect m_area;
323 };
324 
326 {
327  if (d->m_words.isEmpty())
328  return new RegularAreaRect();
329 
344  RegularAreaRect *ret = new RegularAreaRect;
345 
346  PagePrivate *pagePrivate = PagePrivate::get(d->m_page);
347  const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
348 #if 0
349  int it = -1;
350  int itB = -1;
351  int itE = -1;
352 
353  // ending cursor is higher than start cursor, we need to find positions in reverse
354  NormalizedRect tmp;
355  NormalizedRect start;
356  NormalizedRect end;
357 
358  NormalizedPoint startC = sel->start();
359  double startCx = startC.x;
360  double startCy = startC.y;
361 
362  NormalizedPoint endC = sel->end();
363  double endCx = endC.x;
364  double endCy = endC.y;
365 
366  if ( sel->direction() == 1 || ( sel->itB() == -1 && sel->direction() == 0 ) )
367  {
368 #ifdef DEBUG_TEXTPAGE
369  qCWarning(OkularCoreDebug) << "running first loop";
370 #endif
371  const int count = d->m_words.count();
372  for ( it = 0; it < count; it++ )
373  {
374  tmp = *d->m_words[ it ]->area();
375  if ( tmp.contains( startCx, startCy )
376  || ( tmp.top <= startCy && tmp.bottom >= startCy && tmp.left >= startCx )
377  || ( tmp.top >= startCy))
378  {
380  itB = it;
381 #ifdef DEBUG_TEXTPAGE
382  qCWarning(OkularCoreDebug) << "start is" << itB << "count is" << d->m_words.count();
383 #endif
384  break;
385  }
386  }
387  sel->itB( itB );
388  }
389  itB = sel->itB();
390 #ifdef DEBUG_TEXTPAGE
391  qCWarning(OkularCoreDebug) << "direction is" << sel->direction();
392  qCWarning(OkularCoreDebug) << "reloaded start is" << itB << "against" << sel->itB();
393 #endif
394  if ( sel->direction() == 0 || ( sel->itE() == -1 && sel->direction() == 1 ) )
395  {
396 #ifdef DEBUG_TEXTPAGE
397  qCWarning(OkularCoreDebug) << "running second loop";
398 #endif
399  for ( it = d->m_words.count() - 1; it >= itB; it-- )
400  {
401  tmp = *d->m_words[ it ]->area();
402  if ( tmp.contains( endCx, endCy )
403  || ( tmp.top <= endCy && tmp.bottom >= endCy && tmp.right <= endCx )
404  || ( tmp.bottom <= endCy ) )
405  {
407  itE = it;
408 #ifdef DEBUG_TEXTPAGE
409  qCWarning(OkularCoreDebug) << "ending is" << itE << "count is" << d->m_words.count();
410  qCWarning(OkularCoreDebug) << "conditions" << tmp.contains( endCx, endCy ) << " "
411  << ( tmp.top <= endCy && tmp.bottom >= endCy && tmp.right <= endCx ) << " " <<
412  ( tmp.top >= endCy);
413 #endif
414  break;
415  }
416  }
417  sel->itE( itE );
418  }
419 #ifdef DEBUG_TEXTPAGE
420  qCWarning(OkularCoreDebug) << "reloaded ending is" << itE << "against" << sel->itE();
421 #endif
422 
423  if ( sel->itB() != -1 && sel->itE() != -1 )
424  {
425  start = *d->m_words[ sel->itB() ]->area();
426  end = *d->m_words[ sel->itE() ]->area();
427 
428  NormalizedRect first, second, third;
432  first = start;
433  second.top = start.bottom;
434  first.right = second.right = 1;
435  third = end;
436  third.left = second.left = 0;
437  second.bottom = end.top;
438  int selMax = qMax( sel->itB(), sel->itE() );
439  for ( it = qMin( sel->itB(), sel->itE() ); it <= selMax; ++it )
440  {
441  tmp = *d->m_words[ it ]->area();
442  if ( tmp.intersects( &first ) || tmp.intersects( &second ) || tmp.intersects( &third ) )
443  ret->appendShape( d->m_words.at( it )->transformedArea( matrix ) );
444  }
445  }
446 #else
447  const double scaleX = d->m_page->width();
448  const double scaleY = d->m_page->height();
449 
450  NormalizedPoint startC = sel->start();
451  NormalizedPoint endC = sel->end();
452  NormalizedPoint temp;
453 
454  // if startPoint is right to endPoint swap them
455  if (startC.x > endC.x) {
456  temp = startC;
457  startC = endC;
458  endC = temp;
459  }
460 
461  // minX,maxX,minY,maxY gives the bounding rectangle coordinates of the document
462  const NormalizedRect boundingRect = d->m_page->boundingBox();
463  const QRect content = boundingRect.geometry(scaleX, scaleY);
464  const double minX = content.left();
465  const double maxX = content.right();
466  const double minY = content.top();
467  const double maxY = content.bottom();
468 
500  // we know that startC.x > endC.x, we need to decide which is top and which is bottom
501  const NormalizedRect start_end = (startC.y < endC.y) ? NormalizedRect(startC.x, startC.y, endC.x, endC.y) : NormalizedRect(startC.x, endC.y, endC.x, startC.y);
502 
503  // Case 1(a)
504  if (!boundingRect.intersects(start_end))
505  return ret;
506 
507  // case 1(b)
513  else {
514  // if start is left to content rect take it to content rect boundary
515  if (startC.x * scaleX < minX)
516  startC.x = minX / scaleX;
517  if (endC.x * scaleX > maxX)
518  endC.x = maxX / scaleX;
519 
520  // if start is top to end (selection type 01)
521  if (startC.y * scaleY < minY)
522  startC.y = minY / scaleY;
523  if (endC.y * scaleY > maxY)
524  endC.y = maxY / scaleY;
525 
526  // if start is bottom to end (selection type 02)
527  if (startC.y * scaleY > maxY)
528  startC.y = maxY / scaleY;
529  if (endC.y * scaleY < minY)
530  endC.y = minY / scaleY;
531  }
532 
533  TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
534  TextList::ConstIterator start = it, end = itEnd, tmpIt = it; //, tmpItEnd = itEnd;
535  const MergeSide side = d->m_page ? (MergeSide)d->m_page->totalOrientation() : MergeRight;
536 
537  NormalizedRect tmp;
538  // case 2(a)
539  for (; it != itEnd; ++it) {
540  tmp = (*it)->area;
541  if (tmp.contains(startC.x, startC.y)) {
542  start = it;
543  }
544  if (tmp.contains(endC.x, endC.y)) {
545  end = it;
546  }
547  }
548 
549  // case 2(b)
550  it = tmpIt;
551  if (start == it && end == itEnd) {
552  for (; it != itEnd; ++it) {
553  // is there any text rectangle within the start_end rect
554  tmp = (*it)->area;
555  if (start_end.intersects(tmp))
556  break;
557  }
558 
559  // we have searched every text entities, but none is within the rectangle created by start and end
560  // so, no selection should be done
561  if (it == itEnd) {
562  return ret;
563  }
564  }
565  it = tmpIt;
566  bool selection_two_start = false;
567 
568  // case 3.a
569  if (start == it) {
570  bool flagV = false;
571  NormalizedRect rect;
572 
573  // selection type 01
574  if (startC.y <= endC.y) {
575  for (; it != itEnd; ++it) {
576  rect = (*it)->area;
577  rect.isBottom(startC) ? flagV = false : flagV = true;
578 
579  if (flagV && rect.isRight(startC)) {
580  start = it;
581  break;
582  }
583  }
584  }
585 
586  // selection type 02
587  else {
588  selection_two_start = true;
589  int distance = scaleX + scaleY + 100;
590  int count = 0;
591 
592  for (; it != itEnd; ++it) {
593  rect = (*it)->area;
594 
595  if (rect.isBottomOrLevel(startC) && rect.isRight(startC)) {
596  count++;
597  QRect entRect = rect.geometry(scaleX, scaleY);
598  int xdist, ydist;
599  xdist = entRect.center().x() - startC.x * scaleX;
600  ydist = entRect.center().y() - startC.y * scaleY;
601 
602  // make them positive
603  if (xdist < 0)
604  xdist = -xdist;
605  if (ydist < 0)
606  ydist = -ydist;
607 
608  if ((xdist + ydist) < distance) {
609  distance = xdist + ydist;
610  start = it;
611  }
612  }
613  }
614  }
615  }
616 
617  // case 3.b
618  if (end == itEnd) {
619  it = tmpIt;
620  itEnd = itEnd - 1;
621 
622  bool flagV = false;
623  NormalizedRect rect;
624 
625  if (startC.y <= endC.y) {
626  for (; itEnd >= it; itEnd--) {
627  rect = (*itEnd)->area;
628  rect.isTop(endC) ? flagV = false : flagV = true;
629 
630  if (flagV && rect.isLeft(endC)) {
631  end = itEnd;
632  break;
633  }
634  }
635  }
636 
637  else {
638  int distance = scaleX + scaleY + 100;
639  for (; itEnd >= it; itEnd--) {
640  rect = (*itEnd)->area;
641 
642  if (rect.isTopOrLevel(endC) && rect.isLeft(endC)) {
643  QRect entRect = rect.geometry(scaleX, scaleY);
644  int xdist, ydist;
645  xdist = entRect.center().x() - endC.x * scaleX;
646  ydist = entRect.center().y() - endC.y * scaleY;
647 
648  // make them positive
649  if (xdist < 0)
650  xdist = -xdist;
651  if (ydist < 0)
652  ydist = -ydist;
653 
654  if ((xdist + ydist) < distance) {
655  distance = xdist + ydist;
656  end = itEnd;
657  }
658  }
659  }
660  }
661  }
662 
663  /* if start and end in selection 02 are in the same column, and we
664  start at an empty space we have to remove the selection of last
665  character
666  */
667  if (selection_two_start) {
668  if (start > end) {
669  start = start - 1;
670  }
671  }
672 
673  // if start is less than end swap them
674  if (start > end) {
675  it = start;
676  start = end;
677  end = it;
678  }
679 
680  // removes the possibility of crash, in case none of 1 to 3 is true
681  if (end == d->m_words.constEnd())
682  end--;
683 
684  for (; start <= end; start++) {
685  ret->appendShape((*start)->transformedArea(matrix), side);
686  }
687 
688 #endif
689 
690  return ret;
691 }
692 
693 RegularAreaRect *TextPage::findText(int searchID, const QString &query, SearchDirection direct, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
694 {
695  SearchDirection dir = direct;
696  // invalid search request
697  if (d->m_words.isEmpty() || query.isEmpty() || (area && area->isNull()))
698  return nullptr;
699  TextList::ConstIterator start;
700  int start_offset = 0;
701  TextList::ConstIterator end;
702  const QMap<int, SearchPoint *>::const_iterator sIt = d->m_searchPoints.constFind(searchID);
703  if (sIt == d->m_searchPoints.constEnd()) {
704  // if no previous run of this search is found, then set it to start
705  // from the beginning (respecting the search direction)
706  if (dir == NextResult)
707  dir = FromTop;
708  else if (dir == PreviousResult)
709  dir = FromBottom;
710  }
711  bool forward = true;
712  switch (dir) {
713  case FromTop:
714  start = d->m_words.constBegin();
715  start_offset = 0;
716  end = d->m_words.constEnd();
717  break;
718  case FromBottom:
719  start = d->m_words.constEnd();
720  start_offset = 0;
721  end = d->m_words.constBegin();
722  forward = false;
723  break;
724  case NextResult:
725  start = (*sIt)->it_end;
726  start_offset = (*sIt)->offset_end;
727  end = d->m_words.constEnd();
728  break;
729  case PreviousResult:
730  start = (*sIt)->it_begin;
731  start_offset = (*sIt)->offset_begin;
732  end = d->m_words.constBegin();
733  forward = false;
734  break;
735  };
736  RegularAreaRect *ret = nullptr;
737  const TextComparisonFunction cmpFn = caseSensitivity == Qt::CaseSensitive ? CaseSensitiveCmpFn : CaseInsensitiveCmpFn;
738  if (forward) {
739  ret = d->findTextInternalForward(searchID, query, cmpFn, start, start_offset, end);
740  } else {
741  ret = d->findTextInternalBackward(searchID, query, cmpFn, start, start_offset, end);
742  }
743  return ret;
744 }
745 
746 // hyphenated '-' must be at the end of a word, so hyphenation means
747 // we have a '-' just followed by a '\n' character
748 // check if the string contains a '-' character
749 // if the '-' is the last entry
750 static int stringLengthAdaptedWithHyphen(const QString &str, const TextList::ConstIterator &it, const TextList::ConstIterator &textListEnd)
751 {
752  const int len = str.length();
753 
754  // hyphenated '-' must be at the end of a word, so hyphenation means
755  // we have a '-' just followed by a '\n' character
756  // check if the string contains a '-' character
757  // if the '-' is the last entry
758  if (str.endsWith(QLatin1Char('-'))) {
759  // validity chek of it + 1
760  if ((it + 1) != textListEnd) {
761  // 1. if the next character is '\n'
762  const QString &lookahedStr = (*(it + 1))->text();
763  if (lookahedStr.startsWith(QLatin1Char('\n'))) {
764  return len - 1;
765  }
766 
767  // 2. if the next word is in a different line or not
768  const NormalizedRect &hyphenArea = (*it)->area;
769  const NormalizedRect &lookaheadArea = (*(it + 1))->area;
770 
771  // lookahead to check whether both the '-' rect and next character rect overlap
772  if (!doesConsumeY(hyphenArea, lookaheadArea, 70)) {
773  return len - 1;
774  }
775  }
776  }
777  // else if it is the second last entry - for example in pdf format
778  else if (str.endsWith(QLatin1String("-\n"))) {
779  return len - 2;
780  }
781 
782  return len;
783 }
784 
785 RegularAreaRect *TextPagePrivate::searchPointToArea(const SearchPoint *sp)
786 {
787  PagePrivate *pagePrivate = PagePrivate::get(m_page);
788  const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
789  RegularAreaRect *ret = new RegularAreaRect;
790 
791  for (TextList::ConstIterator it = sp->it_begin;; it++) {
792  const TinyTextEntity *curEntity = *it;
793  ret->append(curEntity->transformedArea(matrix));
794 
795  if (it == sp->it_end) {
796  break;
797  }
798  }
799 
800  ret->simplify();
801  return ret;
802 }
803 
804 RegularAreaRect *TextPagePrivate::findTextInternalForward(int searchID, const QString &_query, TextComparisonFunction comparer, const TextList::ConstIterator &start, int start_offset, const TextList::ConstIterator &end)
805 {
806  // normalize query search all unicode (including glyphs)
807  const QString query = _query.normalized(QString::NormalizationForm_KC);
808 
809  // j is the current position in our query
810  // queryLeft is the length of the query we have left to match
811  int j = 0, queryLeft = query.length();
812 
813  TextList::ConstIterator it = start;
814  int offset = start_offset;
815 
816  TextList::ConstIterator it_begin = TextList::ConstIterator();
817  int offset_begin = 0; // dummy initial value to suppress compiler warnings
818 
819  while (it != end) {
820  const TinyTextEntity *curEntity = *it;
821  const QString &str = curEntity->text();
822  const int strLen = str.length();
823  const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
824  // adjustedLen <= strLen
825 
826  if (offset >= strLen) {
827  it++;
828  offset = 0;
829  continue;
830  }
831 
832  if (it_begin == TextList::ConstIterator()) {
833  it_begin = it;
834  offset_begin = offset;
835  }
836 
837  // Let the user write the hyphen or not when searching for text
838  int matchedLen = -1;
839  for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
840  // we have equal (or less than) area of the query left as the length of the current
841  // entity
842  const int min = qMin(queryLeft, matchingLen - offset);
843  if (comparer(str.midRef(offset, min), query.midRef(j, min))) {
844  matchedLen = min;
845  break;
846  }
847  }
848 
849  if (matchedLen == -1) {
850  // we have not matched
851  // this means we do not have a complete match
852  // we need to get back to query start
853  // and continue the search from this place
854 #ifdef DEBUG_TEXTPAGE
855  qCDebug(OkularCoreDebug) << "\tnot matched";
856 #endif
857  j = 0;
858  queryLeft = query.length();
859  it = it_begin;
860  offset = offset_begin + 1;
861  it_begin = TextList::ConstIterator();
862  } else {
863  // we have a match
864  // move the current position in the query
865  // to the position after the length of this string
866  // we matched
867  // subtract the length of the current entity from
868  // the left length of the query
869 #ifdef DEBUG_TEXTPAGE
870  qCDebug(OkularCoreDebug) << "\tmatched" << matchedLen;
871 #endif
872  j += matchedLen;
873  queryLeft -= matchedLen;
874 
875  if (queryLeft == 0) {
876  // save or update the search point for the current searchID
877  QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
878  if (sIt == m_searchPoints.end()) {
879  sIt = m_searchPoints.insert(searchID, new SearchPoint);
880  }
881  SearchPoint *sp = *sIt;
882  sp->it_begin = it_begin;
883  sp->it_end = it;
884  sp->offset_begin = offset_begin;
885  sp->offset_end = offset + matchedLen;
886  return searchPointToArea(sp);
887  }
888 
889  it++;
890  offset = 0;
891  }
892  }
893  // end of loop - it means that we've ended the textentities
894 
895  const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
896  if (sIt != m_searchPoints.end()) {
897  SearchPoint *sp = *sIt;
898  m_searchPoints.erase(sIt);
899  delete sp;
900  }
901  return nullptr;
902 }
903 
904 RegularAreaRect *TextPagePrivate::findTextInternalBackward(int searchID, const QString &_query, TextComparisonFunction comparer, const TextList::ConstIterator &start, int start_offset, const TextList::ConstIterator &end)
905 {
906  // normalize query to search all unicode (including glyphs)
907  const QString query = _query.normalized(QString::NormalizationForm_KC);
908 
909  // j is the current position in our query
910  // len is the length of the string in TextEntity
911  // queryLeft is the length of the query we have left
912  int j = query.length(), queryLeft = query.length();
913 
914  TextList::ConstIterator it = start;
915  int offset = start_offset;
916 
917  TextList::ConstIterator it_begin = TextList::ConstIterator();
918  int offset_begin = 0; // dummy initial value to suppress compiler warnings
919 
920  while (true) {
921  if (offset <= 0) {
922  if (it == end) {
923  break;
924  }
925  it--;
926  }
927 
928  const TinyTextEntity *curEntity = *it;
929  const QString &str = curEntity->text();
930  const int strLen = str.length();
931  const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
932  // adjustedLen <= strLen
933 
934  if (offset <= 0) {
935  offset = strLen;
936  }
937 
938  if (it_begin == TextList::ConstIterator()) {
939  it_begin = it;
940  offset_begin = offset;
941  }
942 
943  // Let the user write the hyphen or not when searching for text
944  int matchedLen = -1;
945  // we have equal (or less than) area of the query left as the length of the current
946  // entity
947  for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
948  const int hyphenOffset = (strLen - matchingLen);
949  const int min = qMin(queryLeft + hyphenOffset, offset);
950  if (comparer(str.midRef(offset - min, min - hyphenOffset), query.midRef(j - min + hyphenOffset, min - hyphenOffset))) {
951  matchedLen = min - hyphenOffset;
952  break;
953  }
954  }
955 
956  if (matchedLen == -1) {
957  // we have not matched
958  // this means we do not have a complete match
959  // we need to get back to query start
960  // and continue the search from this place
961 #ifdef DEBUG_TEXTPAGE
962  qCDebug(OkularCoreDebug) << "\tnot matched";
963 #endif
964 
965  j = query.length();
966  queryLeft = query.length();
967  it = it_begin;
968  offset = offset_begin - 1;
969  it_begin = TextList::ConstIterator();
970  } else {
971  // we have a match
972  // move the current position in the query
973  // to the position after the length of this string
974  // we matched
975  // subtract the length of the current entity from
976  // the left length of the query
977 #ifdef DEBUG_TEXTPAGE
978  qCDebug(OkularCoreDebug) << "\tmatched";
979 #endif
980  j -= matchedLen;
981  queryLeft -= matchedLen;
982 
983  if (queryLeft == 0) {
984  // save or update the search point for the current searchID
985  QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
986  if (sIt == m_searchPoints.end()) {
987  sIt = m_searchPoints.insert(searchID, new SearchPoint);
988  }
989  SearchPoint *sp = *sIt;
990  sp->it_begin = it;
991  sp->it_end = it_begin;
992  sp->offset_begin = offset - matchedLen;
993  sp->offset_end = offset_begin;
994  return searchPointToArea(sp);
995  }
996 
997  offset = 0;
998  }
999  }
1000  // end of loop - it means that we've ended the textentities
1001 
1002  const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
1003  if (sIt != m_searchPoints.end()) {
1004  SearchPoint *sp = *sIt;
1005  m_searchPoints.erase(sIt);
1006  delete sp;
1007  }
1008  return nullptr;
1009 }
1010 
1012 {
1014 }
1015 
1017 {
1018  if (area && area->isNull())
1019  return QString();
1020 
1021  TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
1022  QString ret;
1023  if (area) {
1024  for (; it != itEnd; ++it) {
1026  if (area->intersects((*it)->area)) {
1027  ret += (*it)->text();
1028  }
1029  } else {
1030  NormalizedPoint center = (*it)->area.center();
1031  if (area->contains(center.x, center.y)) {
1032  ret += (*it)->text();
1033  }
1034  }
1035  }
1036  } else {
1037  for (; it != itEnd; ++it)
1038  ret += (*it)->text();
1039  }
1040  return ret;
1041 }
1042 
1043 static bool compareTinyTextEntityX(const WordWithCharacters &first, const WordWithCharacters &second)
1044 {
1045  QRect firstArea = first.area().roundedGeometry(1000, 1000);
1046  QRect secondArea = second.area().roundedGeometry(1000, 1000);
1047 
1048  return firstArea.left() < secondArea.left();
1049 }
1050 
1051 static bool compareTinyTextEntityY(const WordWithCharacters &first, const WordWithCharacters &second)
1052 {
1053  const QRect firstArea = first.area().roundedGeometry(1000, 1000);
1054  const QRect secondArea = second.area().roundedGeometry(1000, 1000);
1055 
1056  return firstArea.top() < secondArea.top();
1057 }
1058 
1062 void TextPagePrivate::setWordList(const TextList &list)
1063 {
1064  qDeleteAll(m_words);
1065  m_words = list;
1066 }
1067 
1072 static void removeSpace(TextList *words)
1073 {
1074  TextList::Iterator it = words->begin();
1075  const QString str(QLatin1Char(' '));
1076 
1077  while (it != words->end()) {
1078  if ((*it)->text() == str) {
1079  it = words->erase(it);
1080  } else {
1081  ++it;
1082  }
1083  }
1084 }
1085 
1093 static WordsWithCharacters makeWordFromCharacters(const TextList &characters, int pageWidth, int pageHeight)
1094 {
1106  WordsWithCharacters wordsWithCharacters;
1107 
1108  TextList::ConstIterator it = characters.begin(), itEnd = characters.end(), tmpIt;
1109  int newLeft, newRight, newTop, newBottom;
1110  int index = 0;
1111 
1112  for (; it != itEnd; it++) {
1113  QString textString = (*it)->text();
1114  QString newString;
1115  QRect lineArea = (*it)->area.roundedGeometry(pageWidth, pageHeight), elementArea;
1116  TextList wordCharacters;
1117  tmpIt = it;
1118  int space = 0;
1119 
1120  while (!space) {
1121  if (!textString.isEmpty()) {
1122  newString.append(textString);
1123 
1124  // when textString is the start of the word
1125  if (tmpIt == it) {
1126  NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1127  wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1128  } else {
1129  NormalizedRect newRect(elementArea, pageWidth, pageHeight);
1130  wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1131  }
1132  }
1133 
1134  ++it;
1135 
1136  /*
1137  we must have to put this line before the if condition of it==itEnd
1138  otherwise the last character can be missed
1139  */
1140  if (it == itEnd)
1141  break;
1142  elementArea = (*it)->area.roundedGeometry(pageWidth, pageHeight);
1143  if (!doesConsumeY(elementArea, lineArea, 60)) {
1144  --it;
1145  break;
1146  }
1147 
1148  const int text_y1 = elementArea.top(), text_x1 = elementArea.left(), text_y2 = elementArea.y() + elementArea.height(), text_x2 = elementArea.x() + elementArea.width();
1149  const int line_y1 = lineArea.top(), line_x1 = lineArea.left(), line_y2 = lineArea.y() + lineArea.height(), line_x2 = lineArea.x() + lineArea.width();
1150 
1151  space = elementArea.left() - lineArea.right();
1152 
1153  if (space != 0) {
1154  it--;
1155  break;
1156  }
1157 
1158  newLeft = text_x1 < line_x1 ? text_x1 : line_x1;
1159  newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1160  newTop = text_y1 > line_y1 ? line_y1 : text_y1;
1161  newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1162 
1163  lineArea.setLeft(newLeft);
1164  lineArea.setTop(newTop);
1165  lineArea.setWidth(newRight - newLeft);
1166  lineArea.setHeight(newBottom - newTop);
1167 
1168  textString = (*it)->text();
1169  }
1170 
1171  // if newString is not empty, save it
1172  if (!newString.isEmpty()) {
1173  const NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1174  TinyTextEntity *word = new TinyTextEntity(newString.normalized(QString::NormalizationForm_KC), newRect);
1175  wordsWithCharacters.append(WordWithCharacters(word, wordCharacters));
1176 
1177  index++;
1178  }
1179 
1180  if (it == itEnd)
1181  break;
1182  }
1183 
1184  return wordsWithCharacters;
1185 }
1186 
1190 QList<QPair<WordsWithCharacters, QRect>> makeAndSortLines(const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight)
1191 {
1204 
1205  /*
1206  Make a new copy of the TextList in the words, so that the wordsTmp and lines do
1207  not contain same pointers for all the TinyTextEntity.
1208  */
1209  QList<WordWithCharacters> words = wordsTmp;
1210 
1211  // Step 1
1212  std::sort(words.begin(), words.end(), compareTinyTextEntityY);
1213 
1214  // Step 2
1215  QList<WordWithCharacters>::Iterator it = words.begin(), itEnd = words.end();
1216 
1217  // for every non-space texts(characters/words) in the textList
1218  for (; it != itEnd; it++) {
1219  const QRect elementArea = (*it).area().roundedGeometry(pageWidth, pageHeight);
1220  bool found = false;
1221 
1222  for (QPair<WordsWithCharacters, QRect> &linesI : lines) {
1223  /* the line area which will be expanded
1224  line_rects is only necessary to preserve the topmin and bottommax of all
1225  the texts in the line, left and right is not necessary at all
1226  */
1227  QRect &lineArea = linesI.second;
1228  const int text_y1 = elementArea.top(), text_y2 = elementArea.top() + elementArea.height(), text_x1 = elementArea.left(), text_x2 = elementArea.left() + elementArea.width();
1229  const int line_y1 = lineArea.top(), line_y2 = lineArea.top() + lineArea.height(), line_x1 = lineArea.left(), line_x2 = lineArea.left() + lineArea.width();
1230 
1231  /*
1232  if the new text and the line has y overlapping parts of more than 70%,
1233  the text will be added to this line
1234  */
1235  if (doesConsumeY(elementArea, lineArea, 70)) {
1236  WordsWithCharacters &line = linesI.first;
1237  line.append(*it);
1238 
1239  const int newLeft = line_x1 < text_x1 ? line_x1 : text_x1;
1240  const int newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1241  const int newTop = line_y1 < text_y1 ? line_y1 : text_y1;
1242  const int newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1243 
1244  lineArea = QRect(newLeft, newTop, newRight - newLeft, newBottom - newTop);
1245  found = true;
1246  }
1247 
1248  if (found)
1249  break;
1250  }
1251 
1252  /* when we have found a new line create a new TextList containing
1253  only one element and append it to the lines
1254  */
1255  if (!found) {
1256  WordsWithCharacters tmp;
1257  tmp.append((*it));
1258  lines.append(QPair<WordsWithCharacters, QRect>(tmp, elementArea));
1259  }
1260  }
1261 
1262  // Step 3
1263  for (QPair<WordsWithCharacters, QRect> &line : lines) {
1264  WordsWithCharacters &list = line.first;
1265  std::sort(list.begin(), list.end(), compareTinyTextEntityX);
1266  }
1267 
1268  return lines;
1269 }
1270 
1274 static void calculateStatisticalInformation(const QList<WordWithCharacters> &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing)
1275 {
1286  const QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(words, pageWidth, pageHeight);
1287 
1291  QMap<int, int> line_space_stat;
1292  for (int i = 0; i < sortedLines.length(); i++) {
1293  const QRect rectUpper = sortedLines.at(i).second;
1294 
1295  if (i + 1 == sortedLines.length())
1296  break;
1297  const QRect rectLower = sortedLines.at(i + 1).second;
1298 
1299  int linespace = rectLower.top() - (rectUpper.top() + rectUpper.height());
1300  if (linespace < 0)
1301  linespace = -linespace;
1302 
1303  if (line_space_stat.contains(linespace))
1304  line_space_stat[linespace]++;
1305  else
1306  line_space_stat[linespace] = 1;
1307  }
1308 
1309  *line_spacing = 0;
1310  int weighted_count = 0;
1311  QMapIterator<int, int> iterate_linespace(line_space_stat);
1312 
1313  while (iterate_linespace.hasNext()) {
1314  iterate_linespace.next();
1315  *line_spacing += iterate_linespace.value() * iterate_linespace.key();
1316  weighted_count += iterate_linespace.value();
1317  }
1318  if (*line_spacing != 0)
1319  *line_spacing = (int)((double)*line_spacing / (double)weighted_count + 0.5);
1320 
1324  // We would like to use QMap instead of QHash as it will keep the keys sorted
1325  QMap<int, int> hor_space_stat;
1326  QMap<int, int> col_space_stat;
1327  QList<QList<QRect>> space_rects;
1328  QVector<QRect> max_hor_space_rects;
1329 
1330  // Space in every line
1331  for (const QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1332  const WordsWithCharacters list = sortedLine.first;
1333  QList<QRect> line_space_rects;
1334  int maxSpace = 0, minSpace = pageWidth;
1335 
1336  // for every TinyTextEntity element in the line
1337  WordsWithCharacters::ConstIterator it = list.begin(), itEnd = list.end();
1338  QRect max_area1, max_area2;
1339  QString before_max, after_max;
1340 
1341  // for every line
1342  for (; it != itEnd; it++) {
1343  const QRect area1 = (*it).area().roundedGeometry(pageWidth, pageHeight);
1344  if (it + 1 == itEnd)
1345  break;
1346 
1347  const QRect area2 = (*(it + 1)).area().roundedGeometry(pageWidth, pageHeight);
1348  int space = area2.left() - area1.right();
1349 
1350  if (space > maxSpace) {
1351  max_area1 = area1;
1352  max_area2 = area2;
1353  maxSpace = space;
1354  before_max = (*it).text();
1355  after_max = (*(it + 1)).text();
1356  }
1357 
1358  if (space < minSpace && space != 0)
1359  minSpace = space;
1360 
1361  // if we found a real space, whose length is not zero and also less than the pageWidth
1362  if (space != 0 && space != pageWidth) {
1363  // increase the count of the space amount
1364  if (hor_space_stat.contains(space))
1365  hor_space_stat[space]++;
1366  else
1367  hor_space_stat[space] = 1;
1368 
1369  int left, right, top, bottom;
1370 
1371  left = area1.right();
1372  right = area2.left();
1373 
1374  top = area2.top() < area1.top() ? area2.top() : area1.top();
1375  bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1376 
1377  QRect rect(left, top, right - left, bottom - top);
1378  line_space_rects.append(rect);
1379  }
1380  }
1381 
1382  space_rects.append(line_space_rects);
1383 
1384  if (hor_space_stat.contains(maxSpace)) {
1385  if (hor_space_stat[maxSpace] != 1)
1386  hor_space_stat[maxSpace]--;
1387  else
1388  hor_space_stat.remove(maxSpace);
1389  }
1390 
1391  if (maxSpace != 0) {
1392  if (col_space_stat.contains(maxSpace))
1393  col_space_stat[maxSpace]++;
1394  else
1395  col_space_stat[maxSpace] = 1;
1396 
1397  // store the max rect of each line
1398  const int left = max_area1.right();
1399  const int right = max_area2.left();
1400  const int top = (max_area1.top() > max_area2.top()) ? max_area2.top() : max_area1.top();
1401  const int bottom = (max_area1.bottom() < max_area2.bottom()) ? max_area2.bottom() : max_area1.bottom();
1402 
1403  const QRect rect(left, top, right - left, bottom - top);
1404  max_hor_space_rects.append(rect);
1405  } else
1406  max_hor_space_rects.append(QRect(0, 0, 0, 0));
1407  }
1408 
1409  // All the between word space counts are in hor_space_stat
1410  *word_spacing = 0;
1411  weighted_count = 0;
1412  QMapIterator<int, int> iterate(hor_space_stat);
1413 
1414  while (iterate.hasNext()) {
1415  iterate.next();
1416 
1417  if (iterate.key() > 0) {
1418  *word_spacing += iterate.value() * iterate.key();
1419  weighted_count += iterate.value();
1420  }
1421  }
1422  if (weighted_count)
1423  *word_spacing = (int)((double)*word_spacing / (double)weighted_count + 0.5);
1424 
1425  *col_spacing = 0;
1426  QMapIterator<int, int> iterate_col(col_space_stat);
1427 
1428  while (iterate_col.hasNext()) {
1429  iterate_col.next();
1430  if (iterate_col.value() > *col_spacing)
1431  *col_spacing = iterate_col.value();
1432  }
1433  *col_spacing = col_space_stat.key(*col_spacing);
1434 
1435  // if there is just one line in a region, there is no point in dividing it
1436  if (sortedLines.length() == 1)
1437  *word_spacing = *col_spacing;
1438 }
1439 
/**
 * Iteratively splits the given words into rectangular regions.
 *
 * The list `tree` starts with one region that holds every word and the
 * geometry of boundingBox. The region at index i is examined: when a wide
 * enough horizontal or vertical gap is found, the region is replaced by the
 * first half, the second half is inserted at i + 1 and index i is examined
 * again; otherwise the region's rectangle is stored with its empty borders
 * trimmed and i advances. The loop ends when i reaches the end of the list.
 *
 * @param wordsWithCharacters words to distribute among the regions
 * @param boundingBox         normalized rectangle used as area of the initial region
 * @param pageWidth           horizontal scale used to map normalized areas to integer rectangles
 * @param pageHeight          vertical scale used to map normalized areas to integer rectangles
 * @return the resulting regions, in list order
 */
1445 static RegionTextList XYCutForBoundingBoxes(const QList<WordWithCharacters> &wordsWithCharacters, const NormalizedRect &boundingBox, int pageWidth, int pageHeight)
1446 {
1447  RegionTextList tree;
1448  QRect contentRect(boundingBox.geometry(pageWidth, pageHeight));
1449  const RegionText root(wordsWithCharacters, contentRect);
1450 
1451  // start the tree with the root, it is our only region at the start
1452  tree.push_back(root);
1453 
1454  int i = 0;
1455 
1456  // as long as there are regions left to examine
1457  while (i < tree.length()) {
1458  const RegionText node = tree.at(i);
1459  QRect regionRect = node.area();
1460 
 // 1. Calculation of the projection profiles
1464  // one profile entry per pixel column / row of the region, initialized with 0
1465  int size_proj_y = node.area().height();
1466  int size_proj_x = node.area().width();
1467  // dynamic memory allocation
1468  QVarLengthArray<int> proj_on_xaxis(size_proj_x);
1469  QVarLengthArray<int> proj_on_yaxis(size_proj_y);
1470 
1471  for (int j = 0; j < size_proj_y; ++j)
1472  proj_on_yaxis[j] = 0;
1473  for (int j = 0; j < size_proj_x; ++j)
1474  proj_on_xaxis[j] = 0;
1475 
1476  const QList<WordWithCharacters> list = node.text();
1477 
1478  // Calculate tcx and tcy locally for each new region
 // (column_spacing only receives the third output; it is not read afterwards)
1479  int word_spacing, line_spacing, column_spacing;
1480  calculateStatisticalInformation(list, pageWidth, pageHeight, &word_spacing, &line_spacing, &column_spacing);
1481 
 // cut thresholds: a gap has to reach twice the word / line spacing
1482  const int tcx = word_spacing * 2;
1483  const int tcy = line_spacing * 2;
1484 
 // maxX and maxY are updated below but not read again in this function
1485  int maxX = 0, maxY = 0;
1486  int avgX = 0;
1487  int count;
1488 
1489  // for every text in the region
1490  for (const WordWithCharacters &wwc : list) {
1491  TinyTextEntity *ent = wwc.word;
1492  const QRect entRect = ent->area.geometry(pageWidth, pageHeight);
1493 
1494  // vertical projection profile: every column covered by the word
 // (and lying inside the region) accumulates the word's height
1495  for (int k = entRect.left(); k <= entRect.left() + entRect.width(); ++k) {
1496  if ((k - regionRect.left()) < size_proj_x && (k - regionRect.left()) >= 0)
1497  proj_on_xaxis[k - regionRect.left()] += entRect.height();
1498  }
1499 
1500  // horizontal projection profile in the same way, accumulating the word's width per row
1501  for (int k = entRect.top(); k <= entRect.top() + entRect.height(); ++k) {
1502  if ((k - regionRect.top()) < size_proj_y && (k - regionRect.top()) >= 0)
1503  proj_on_yaxis[k - regionRect.top()] += entRect.width();
1504  }
1505  }
1506 
1507  for (int j = 0; j < size_proj_y; ++j) {
1508  if (proj_on_yaxis[j] > maxY)
1509  maxY = proj_on_yaxis[j];
1510  }
1511 
 // avgX: mean of the non-zero entries of the x profile
1512  avgX = count = 0;
1513  for (int j = 0; j < size_proj_x; ++j) {
1514  if (proj_on_xaxis[j] > maxX)
1515  maxX = proj_on_xaxis[j];
1516  if (proj_on_xaxis[j]) {
1517  count++;
1518  avgX += proj_on_xaxis[j];
1519  }
1520  }
1521  if (count)
1522  avgX /= count;
1523 
 // 2. Cleanup of the boundary white space and removal of noise
 // skip the profile entries <= 0 at both ends of each profile
1527  int xbegin = 0, xend = size_proj_x - 1;
1528  int ybegin = 0, yend = size_proj_y - 1;
1529  while (xbegin < size_proj_x && proj_on_xaxis[xbegin] <= 0)
1530  xbegin++;
1531  while (xend >= 0 && proj_on_xaxis[xend] <= 0)
1532  xend--;
1533  while (ybegin < size_proj_y && proj_on_yaxis[ybegin] <= 0)
1534  ybegin++;
1535  while (yend >= 0 && proj_on_yaxis[yend] <= 0)
1536  yend--;
1537 
1538  // update the regionRect
1539  int old_left = regionRect.left(), old_top = regionRect.top();
1540  regionRect.setLeft(old_left + xbegin);
1541  regionRect.setRight(old_left + xend);
1542  regionRect.setTop(old_top + ybegin);
1543  regionRect.setBottom(old_top + yend);
1544 
 // noise thresholds: tnx is 10% of avgX (rounded); tny is 0, so the y profile keeps its values
1545  int tnx = (int)((double)avgX * 10.0 / 100.0 + 0.5), tny = 0;
1546  for (int j = 0; j < size_proj_x; ++j)
1547  proj_on_xaxis[j] -= tnx;
1548  for (int j = 0; j < size_proj_y; ++j)
1549  proj_on_yaxis[j] -= tny;
1550 
 // 3. Find the widest gap
 // a gap is a run of entries <= 0 that lies between two entries > 0
1554  int gap_hor = -1, pos_hor = -1;
1555  int begin = -1, end = -1;
1556 
1557  // find all hor_gaps and find the maximum between them
1558  for (int j = 1; j < size_proj_y; ++j) {
1559  // transition from white to black
1560  if (begin >= 0 && proj_on_yaxis[j - 1] <= 0 && proj_on_yaxis[j] > 0)
1561  end = j;
1562 
1563  // transition from black to white
1564  if (proj_on_yaxis[j - 1] > 0 && proj_on_yaxis[j] <= 0)
1565  begin = j;
1566 
 // a complete gap wider than the best one so far: remember its width and its middle
1567  if (begin > 0 && end > 0 && end - begin > gap_hor) {
1568  gap_hor = end - begin;
1569  pos_hor = (end + begin) / 2;
1570  begin = -1;
1571  end = -1;
1572  }
1573  }
1574 
1575  begin = -1, end = -1;
1576  int gap_ver = -1, pos_ver = -1;
1577 
1578  // find all the ver_gaps and find the maximum between them
1579  for (int j = 1; j < size_proj_x; ++j) {
1580  // transition from white to black
1581  if (begin >= 0 && proj_on_xaxis[j - 1] <= 0 && proj_on_xaxis[j] > 0) {
1582  end = j;
1583  }
1584 
1585  // transition from black to white
1586  if (proj_on_xaxis[j - 1] > 0 && proj_on_xaxis[j] <= 0)
1587  begin = j;
1588 
1589  if (begin > 0 && end > 0 && end - begin > gap_ver) {
1590  gap_ver = end - begin;
1591  pos_ver = (end + begin) / 2;
1592  begin = -1;
1593  end = -1;
1594  }
1595  }
1596 
 // the cut positions are offsets relative to the untrimmed region (old_left / old_top)
1597  int cut_pos_x = pos_ver, cut_pos_y = pos_hor;
1598  int gap_x = gap_ver, gap_y = gap_hor;
1599 
 // 4. Cut the region and make the nodes (left, right) or (top, bottom)
1603  bool cut_hor = false, cut_ver = false;
1604 
1605  // For horizontal cut
1606  const int topHeight = cut_pos_y - (regionRect.top() - old_top);
1607  const QRect topRect(regionRect.left(), regionRect.top(), regionRect.width(), topHeight);
1608  const QRect bottomRect(regionRect.left(), regionRect.top() + topHeight, regionRect.width(), regionRect.height() - topHeight);
1609 
1610  // For vertical Cut
1611  const int leftWidth = cut_pos_x - (regionRect.left() - old_left);
1612  const QRect leftRect(regionRect.left(), regionRect.top(), leftWidth, regionRect.height());
1613  const QRect rightRect(regionRect.left() + leftWidth, regionRect.top(), regionRect.width() - leftWidth, regionRect.height());
1614 
 // prefer the direction with the wider gap if it reaches its threshold,
 // otherwise fall back to the other direction if that one reaches its own
1615  if (gap_y >= gap_x && gap_y >= tcy)
1616  cut_hor = true;
1617  else if (gap_y >= gap_x && gap_y <= tcy && gap_x >= tcx)
1618  cut_ver = true;
1619  else if (gap_x >= gap_y && gap_x >= tcx)
1620  cut_ver = true;
1621  else if (gap_x >= gap_y && gap_x <= tcx && gap_y >= tcy)
1622  cut_hor = true;
1623  // no cut possible
1624  else {
1625  // we can now update the node rectangle with the shrunk rectangle
1626  RegionText tmpNode = tree.at(i);
1627  tmpNode.setArea(regionRect);
1628  tree.replace(i, tmpNode);
1629  i++;
1630  continue;
1631  }
1632 
1633  WordsWithCharacters list1, list2;
1634 
1635  // horizontal cut, topRect and bottomRect
1636  if (cut_hor) {
 // words intersecting topRect go to the first node, all others to the second
1637  for (const WordWithCharacters &word : list) {
1638  const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1639 
1640  if (topRect.intersects(wordRect))
1641  list1.append(word);
1642  else
1643  list2.append(word);
1644  }
1645 
1646  RegionText node1(list1, topRect);
1647  RegionText node2(list2, bottomRect);
1648 
 // i is not advanced: the first half is examined in the next iteration
1649  tree.replace(i, node1);
1650  tree.insert(i + 1, node2);
1651  }
1652 
1653  // vertical cut, leftRect and rightRect
1654  else if (cut_ver) {
 // words intersecting leftRect go to the first node, all others to the second
1655  for (const WordWithCharacters &word : list) {
1656  const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1657 
1658  if (leftRect.intersects(wordRect))
1659  list1.append(word);
1660  else
1661  list2.append(word);
1662  }
1663 
1664  RegionText node1(list1, leftRect);
1665  RegionText node2(list2, rightRect);
1666 
1667  tree.replace(i, node1);
1668  tree.insert(i + 1, node2);
1669  }
1670  }
1671 
1672  return tree;
1673 }
1674 
1678 WordsWithCharacters addNecessarySpace(RegionTextList tree, int pageWidth, int pageHeight)
1679 {
1686  // Only change the texts under RegionTexts, not the area
1687  for (RegionText &tmpRegion : tree) {
1688  // Step 01
1689  QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(tmpRegion.text(), pageWidth, pageHeight);
1690 
1691  // Step 02
1692  for (QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1693  WordsWithCharacters &list = sortedLine.first;
1694  for (int k = 0; k < list.length(); k++) {
1695  const QRect area1 = list.at(k).area().roundedGeometry(pageWidth, pageHeight);
1696  if (k + 1 >= list.length())
1697  break;
1698 
1699  const QRect area2 = list.at(k + 1).area().roundedGeometry(pageWidth, pageHeight);
1700  const int space = area2.left() - area1.right();
1701 
1702  if (space != 0) {
1703  // Make a TinyTextEntity of string space and push it between it and it+1
1704  const int left = area1.right();
1705  const int right = area2.left();
1706  const int top = area2.top() < area1.top() ? area2.top() : area1.top();
1707  const int bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1708 
1709  const QString spaceStr(QStringLiteral(" "));
1710  const QRect rect(QPoint(left, top), QPoint(right, bottom));
1711  const NormalizedRect entRect(rect, pageWidth, pageHeight);
1712  TinyTextEntity *ent1 = new TinyTextEntity(spaceStr, entRect);
1713  TinyTextEntity *ent2 = new TinyTextEntity(spaceStr, entRect);
1714  WordWithCharacters word(ent1, QList<TinyTextEntity *>() << ent2);
1715 
1716  list.insert(k + 1, word);
1717 
1718  // Skip the space
1719  k++;
1720  }
1721  }
1722  }
1723 
1724  WordsWithCharacters tmpList;
1725  for (const QPair<WordsWithCharacters, QRect> &sortedLine : qAsConst(sortedLines)) {
1726  tmpList += sortedLine.first;
1727  }
1728  tmpRegion.setText(tmpList);
1729  }
1730 
1731  // Step 03
1732  WordsWithCharacters tmp;
1733  for (const RegionText &tmpRegion : qAsConst(tree)) {
1734  tmp += tmpRegion.text();
1735  }
1736  return tmp;
1737 }
1738 
1742 void TextPagePrivate::correctTextOrder()
1743 {
1744  // m_page->width() and m_page->height() are in pixels at
1745  // 100% zoom level, and thus depend on display DPI.
1746  // To avoid Okular failing on lowDPI displays,
1747  // we scale pageWidth and pageHeight so their sum equals 2000.
1748  const double scalingFactor = 2000.0 / (m_page->width() + m_page->height());
1749  const int pageWidth = (int)(scalingFactor * m_page->width());
1750  const int pageHeight = (int)(scalingFactor * m_page->height());
1751 
1752  TextList characters = m_words;
1753 
1757  removeSpace(&characters);
1758 
1762  const QList<WordWithCharacters> wordsWithCharacters = makeWordFromCharacters(characters, pageWidth, pageHeight);
1763 
1767  const RegionTextList tree = XYCutForBoundingBoxes(wordsWithCharacters, m_page->boundingBox(), pageWidth, pageHeight);
1768 
1772  const WordsWithCharacters listWithWordsAndSpaces = addNecessarySpace(tree, pageWidth, pageHeight);
1773 
1777  TextList listOfCharacters;
1778  for (const WordWithCharacters &word : listWithWordsAndSpaces) {
1779  delete word.word;
1780  listOfCharacters.append(word.characters);
1781  }
1782  setWordList(listOfCharacters);
1783 }
1784 
1786 {
1787  if (area && area->isNull())
1788  return TextEntity::List();
1789 
1790  TextEntity::List ret;
1791  if (area) {
1792  for (const TinyTextEntity *te : qAsConst(d->m_words)) {
1794  if (area->intersects(te->area)) {
1795  ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1796  }
1797  } else {
1798  const NormalizedPoint center = te->area.center();
1799  if (area->contains(center.x, center.y)) {
1800  ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1801  }
1802  }
1803  }
1804  } else {
1805  for (const TinyTextEntity *te : qAsConst(d->m_words)) {
1806  ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1807  }
1808  }
1809  return ret;
1810 }
1811 
1813 {
1814  TextList::ConstIterator itBegin = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
1815  TextList::ConstIterator it = itBegin;
1816  TextList::ConstIterator posIt = itEnd;
1817  for (; it != itEnd; ++it) {
1818  if ((*it)->area.contains(p.x, p.y)) {
1819  posIt = it;
1820  break;
1821  }
1822  }
1823  QString text;
1824  if (posIt != itEnd) {
1825  if ((*posIt)->text().simplified().isEmpty()) {
1826  return nullptr;
1827  }
1828  // Find the first TinyTextEntity of the word
1829  while (posIt != itBegin) {
1830  --posIt;
1831  const QString itText = (*posIt)->text();
1832  if (itText.right(1).at(0).isSpace()) {
1833  if (itText.endsWith(QLatin1String("-\n"))) {
1834  // Is an hyphenated word
1835  // continue searching the start of the word back
1836  continue;
1837  }
1838 
1839  if (itText == QLatin1String("\n") && posIt != itBegin) {
1840  --posIt;
1841  if ((*posIt)->text().endsWith(QLatin1String("-"))) {
1842  // Is an hyphenated word
1843  // continue searching the start of the word back
1844  continue;
1845  }
1846  ++posIt;
1847  }
1848 
1849  ++posIt;
1850  break;
1851  }
1852  }
1853  RegularAreaRect *ret = new RegularAreaRect();
1854  for (; posIt != itEnd; ++posIt) {
1855  const QString itText = (*posIt)->text();
1856  if (itText.simplified().isEmpty()) {
1857  break;
1858  }
1859 
1860  ret->appendShape((*posIt)->area);
1861  text += (*posIt)->text();
1862  if (itText.right(1).at(0).isSpace()) {
1863  if (!text.endsWith(QLatin1String("-\n"))) {
1864  break;
1865  }
1866  }
1867  }
1868 
1869  if (word) {
1870  *word = text;
1871  }
1872  return ret;
1873  } else {
1874  return nullptr;
1875  }
1876 }
SearchDirection
Describes the direction of searching.
Definition: global.h:38
NormalizedPoint is a helper class which stores the coordinates of a normalized point.
Definition: area.h:119
QString::const_iterator constBegin() const const
QTextStream & center(QTextStream &s)
void setBottom(int y)
NormalizedRect * area() const
Returns the bounding area of the text entity.
Definition: textpage.cpp:201
QString & append(QChar ch)
const QChar * constData() const const
Searching for the next result on the page, earlier result should be located so we search from the las...
Definition: global.h:41
bool contains(const Key &key) const const
int direction() const
Returns the direction of the selection.
Definition: misc.cpp:66
void append(const T &value)
bool isRight(const NormalizedPoint &pt) const
Returns true if the point pt is located to the left of the right edge of the rectangle.
Definition: area.h:374
int right() const const
MergeSide
The side(s) to be considered when merging areas.
Definition: global.h:66
TextPage()
Creates a new text page.
Definition: textpage.cpp:224
int length() const const
QString text() const
Returns the text of the text entity.
Definition: textpage.cpp:196
void transform(const QTransform &matrix)
Transforms the normalized rectangle with the operations defined by matrix.
Definition: area.cpp:253
bool contains(double x, double y) const
Returns whether this area contains the normalized point (x, y).
Definition: area.h:852
const T & at(int i) const const
double left
The normalized left coordinate.
Definition: area.h:416
QString simplified() const const
A NormalizedRect is a rectangle which can be defined by two NormalizedPoints.
Definition: area.h:191
void appendShape(const NormalizedShape &shape, MergeSide side=MergeAll)
Appends the given shape to this area.
Definition: area.h:797
QString text(const RegularAreaRect *area=nullptr) const
Text extraction function.
Definition: textpage.cpp:1011
This is a list of NormalizedRect, to describe an area consisting of multiple rectangles using normali...
Definition: area.h:911
Merge only if the right side of the first area intersect.
Definition: global.h:67
int height() const const
double y
The normalized y coordinate.
Definition: area.h:174
int x() const const
int y() const const
T & first()
bool intersects(const RegularArea< NormalizedShape, Shape > *area) const
Returns whether this area intersects with the given area.
Definition: area.h:773
global.h
Definition: action.h:19
NormalizedPoint start() const
Returns the start point of the selection.
Definition: misc.cpp:71
bool intersects(const NormalizedRect &other) const
Returns whether the normalized rectangle intersects the other normalized rectangle.
Definition: area.cpp:170
int x() const const
int y() const const
Searching from top of the page, next result is to be found, there was no earlier search result...
Definition: global.h:39
QString normalized(QString::NormalizationForm mode, QChar::UnicodeVersion version) const const
Searching from bottom of the page, next result is to be found, there was no earlier search result...
Definition: global.h:40
Represents a piece of text on a TextPage, containing its textual representation and its bounding box...
Definition: textpage.h:54
NormalizedRect transformedArea(const QTransform &matrix) const
Returns the transformed area of the text entity.
Definition: textpage.cpp:206
bool contains(double x, double y) const
Returns whether the normalized rectangle contains the normalized point (x, y).
Definition: area.cpp:165
double right
The normalized right coordinate.
Definition: area.h:426
bool isNull() const
Returns whether the regular area is a null area.
Definition: area.h:747
QString::const_iterator constEnd() const const
Searching for the previous result on the page, earlier result should be located so we search from the...
Definition: global.h:42
QString fromRawData(const QChar *unicode, int size)
void append(const T &value)
bool isSpace() const const
int top() const const
QMapIterator::Item next()
void setTop(int y)
int left() const const
CaseInsensitive
void setWidth(int width)
bool isTopOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located above the bottom of the rectangle.
Definition: area.h:356
bool isEmpty() const const
bool startsWith(const QString &s, Qt::CaseSensitivity cs) const const
void end(const NormalizedPoint &point)
Changes the end point of the selection to the given point.
Definition: misc.cpp:45
bool endsWith(const QString &s, Qt::CaseSensitivity cs) const const
QPoint center() const const
T & first()
~TextEntity()
Destroys the text entity.
Definition: textpage.cpp:191
const Key & key() const const
bool isBottom(const NormalizedPoint &pt) const
Returns true if the point pt is located below the bottom of the rectangle.
Definition: area.h:329
const T & value() const const
QRect geometry(int xScale, int yScale) const
Returns the rectangle mapped to a reference area of xScale x yScale.
Definition: area.cpp:239
int compare(const QString &other, Qt::CaseSensitivity cs) const const
QString right(int n) const const
ushort unicode() const const
QList::iterator end()
bool isBottomOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located below the top of the rectangle.
Definition: area.h:347
QStringRef midRef(int position, int n) const const
double top
The normalized top coordinate.
Definition: area.h:421
const Key key(const T &value, const Key &defaultKey) const const
TextEntity::List words(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
Text entity extraction function.
Definition: textpage.cpp:1785
void setRight(int x)
RegularAreaRect * textArea(TextSelection *selection) const
Returns the rectangular area of the given selection.
Definition: textpage.cpp:325
double x
The normalized x coordinate.
Definition: area.h:169
bool isTop(const NormalizedPoint &pt) const
Returns true if the point pt is located above the top of the rectangle.
Definition: area.h:338
int width() const const
A character is included in the text() result if any pixel of its bounding box is in the given area...
Definition: textpage.h:121
TextAreaInclusionBehaviour
Defines the behaviour of adding characters to text() result.
Definition: textpage.h:120
void insert(int i, const T &value)
void setHeight(int height)
const QChar at(int position) const const
void simplify()
Simplifies this regular area by merging its intersecting subareas.
Definition: area.h:726
typedef ConstIterator
int bottom() const const
int length() const const
NormalizationForm_KC
double bottom
The normalized bottom coordinate.
Definition: area.h:431
bool isLeft(const NormalizedPoint &pt) const
Returns true if the point pt is located to the right of the left edge of the rectangle.
Definition: area.h:365
QList::const_iterator constEnd() const const
RegularAreaRect * wordAt(const NormalizedPoint &p, QString *word=nullptr) const
Returns the area and text of the word at the given point. Note that ownership of the returned area bel...
Definition: textpage.cpp:1812
QList::const_iterator constBegin() const const
TextEntity(const QString &text, NormalizedRect *area)
Creates a new text entity with the given text and the given area.
Definition: textpage.cpp:184
~TextPage()
Destroys the text page.
Definition: textpage.cpp:241
void setLeft(int x)
Wrapper around the information needed to generate the selection area. There are two assumptions inside...
Definition: misc.h:36
QList::iterator begin()
void append(const QString &text, NormalizedRect *area)
Appends the given text with the given area as new TextEntity to the page.
Definition: textpage.cpp:246
bool hasNext() const const
int remove(const Key &key)
RegularAreaRect * findText(int searchID, const QString &query, SearchDirection direction, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
Returns the bounding rect of the text which matches the following criteria or 0 if the search is not ...
Definition: textpage.cpp:693
This file is part of the KDE documentation.
Documentation copyright © 1996-2020 The KDE developers.
Generated on Sat Oct 24 2020 22:41:38 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.