Okular

textpage.cpp
1/*
2 SPDX-FileCopyrightText: 2005 Piotr Szymanski <niedakh@gmail.com>
3
4 SPDX-License-Identifier: GPL-2.0-or-later
5*/
6
7#include "textpage.h"
8#include "textpage_p.h"
9
10#include <QDebug>
11
12#include "area.h"
13#include "debug_p.h"
14#include "misc.h"
15#include "page.h"
16#include "page_p.h"
17#include <unordered_set>
18
19#include <cstring>
20
21#include <QVarLengthArray>
22#include <QtAlgorithms>
23
24using namespace Okular;
25
26// Many of the strings are being reused; especially
27// those less than 2 letters are very common
28// Use the implicit shared bits from QString to
29// not keep multiple same strings around, but just up the
30// refcount a bit
31// The main reason for '2' is that most calls here happens
32// in auxillary threads that are following a document
33// and keeping the pool thread_local gives quite a bit
34// of advantage here.
35// Some calls though comes from the main thread, so we
36// shouldn't keep all the long strings allocated in the main
37// thread around forever.
38// '2' has been chosen by random testing, and guesswork.
39static QString fromPool(const QString &str)
40{
41 if (str.length() > 2) {
42 return str;
43 }
44
45 thread_local std::unordered_set<QString> pool;
46 auto [iterator, success] = pool.insert(str);
47 return *iterator;
48}
49
50class SearchPoint
51{
52public:
53 SearchPoint()
54 : offset_begin(-1)
55 , offset_end(-1)
56 {
57 }
58
59 /** The TextEntity containing the first character of the match. */
61
62 /** The TextEntity containing the last character of the match. */
64
65 /** The index of the first character of the match in (*it_begin)->text().
66 * Satisfies 0 <= offset_begin < (*it_begin)->text().length().
67 */
68 int offset_begin;
69
70 /** One plus the index of the last character of the match in (*it_end)->text().
71 * Satisfies 0 < offset_end <= (*it_end)->text().length().
72 */
73 int offset_end;
74};
75
76/* text comparison functions */
77
78static bool CaseInsensitiveCmpFn(QStringView from, QStringView to)
79{
80#ifdef DEBUG_TEXTPAGE
81 qDebug(OkularCoreDebug) << from << ":" << to << "(case insensitive)";
82#endif
83 return from.compare(to, Qt::CaseInsensitive) == 0;
84}
85
86static bool CaseSensitiveCmpFn(QStringView from, QStringView to)
87{
88#ifdef DEBUG_TEXTPAGE
89 qDebug(OkularCoreDebug) << from << ":" << to << "(case sensitive)";
90#endif
91 return from.compare(to, Qt::CaseSensitive) == 0;
92}
93
94/**
95 * Returns true iff segments [@p left1, @p right1] and [@p left2, @p right2] on the real line
96 * overlap within @p threshold percent, i. e. iff the ratio of the length of the
97 * intersection of the segments to the length of the shortest of the two input segments
98 * is not smaller than the threshold.
99 */
100static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold)
101{
102 // check if one consumes another fully (speed optimization)
103
104 if (left1 <= left2 && right1 >= right2) {
105 return true;
106 }
107
108 if (left1 >= left2 && right1 <= right2) {
109 return true;
110 }
111
112 // check if there is overlap above threshold
113 if (right2 >= left1 && right1 >= left2) {
114 double overlap = (right2 >= right1) ? right1 - left2 : right2 - left1;
115
116 double length1 = right1 - left1, length2 = right2 - left2;
117
118 return overlap * 100 >= threshold * qMin(length1, length2);
119 }
120
121 return false;
122}
123
124static bool doesConsumeY(const QRect first, const QRect second, int threshold)
125{
126 return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold);
127}
128
129static bool doesConsumeY(const NormalizedRect &first, const NormalizedRect &second, int threshold)
130{
131 return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold);
132}
133
135 : m_text(fromPool(text))
136 , m_area(area)
137{
138}
139
140TextEntity::~TextEntity() = default;
141
143{
144 return m_text;
145}
146
148{
149 return m_area;
150}
151
153{
154 NormalizedRect transformed_area = m_area;
155 transformed_area.transform(matrix);
156 return transformed_area;
157}
158
159TextPagePrivate::TextPagePrivate()
160 : m_page(nullptr)
161{
162}
163
164TextPagePrivate::~TextPagePrivate()
165{
166 qDeleteAll(m_searchPoints);
167}
168
170 : d(new TextPagePrivate())
171{
172}
173
175 : d(new TextPagePrivate())
176{
177 d->m_words = words;
178}
179
181{
182 delete d;
183}
184
185void TextPage::append(const QString &text, const NormalizedRect &area)
186{
187 if (!text.isEmpty()) {
188 if (!d->m_words.isEmpty()) {
189 TextEntity &lastEntity = d->m_words.last();
190 const QString concatText = lastEntity.text() + text.normalized(QString::NormalizationForm_KC);
191 if (concatText != concatText.normalized(QString::NormalizationForm_KC)) {
192 // If this happens it means that the new text + old one have combined, for example A and ◌̊ form Å
193 NormalizedRect newArea = area | lastEntity.area();
194 d->m_words.removeLast();
195 d->m_words.append(TextEntity(concatText.normalized(QString::NormalizationForm_KC), newArea));
196 return;
197 }
198 }
199
200 d->m_words.append(TextEntity(text.normalized(QString::NormalizationForm_KC), area));
201 }
202}
203
204struct WordWithCharacters {
205 WordWithCharacters(const TextEntity &w, const TextEntity::List &c)
206 : word(w)
207 , characters(c)
208 {
209 }
210
211 inline QString text() const
212 {
213 return word.text();
214 }
215
216 inline NormalizedRect area() const
217 {
218 return word.area();
219 }
220
221 TextEntity word;
222 TextEntity::List characters;
223};
225
226/**
227 * We will divide the whole page in some regions depending on the horizontal and
228 * vertical spacing among different regions. Each region will have an area and an
229 * associated WordsWithCharacters in sorted order.
230 */
231class RegionText
232{
233public:
234 RegionText() {};
235
236 RegionText(const WordsWithCharacters &wordsWithCharacters, const QRect area)
237 : m_region_wordWithCharacters(wordsWithCharacters)
238 , m_area(area)
239 {
240 }
241
242 inline QString string() const
243 {
244 QString res;
245 for (const WordWithCharacters &word : m_region_wordWithCharacters) {
246 res += word.text();
247 }
248 return res;
249 }
250
251 inline WordsWithCharacters text() const
252 {
253 return m_region_wordWithCharacters;
254 }
255
256 inline QRect area() const
257 {
258 return m_area;
259 }
260
261 inline void setArea(const QRect area)
262 {
263 m_area = area;
264 }
265
266 inline void setText(const WordsWithCharacters &wordsWithCharacters)
267 {
268 m_region_wordWithCharacters = wordsWithCharacters;
269 }
270
271private:
272 WordsWithCharacters m_region_wordWithCharacters;
273 QRect m_area;
274};
275
277{
278 if (d->m_words.isEmpty()) {
279 return new RegularAreaRect();
280 }
281
282 /**
283 It works like this:
284 There are two cursors, we need to select all the text between them. The coordinates are normalised, leftTop is (0,0)
285 rightBottom is (1,1), so for cursors start (sx,sy) and end (ex,ey) we start with finding text rectangles under those
286 points, if not we search for the first that is to the right to it in the same baseline, if none found, then we search
287 for the first rectangle with a baseline under the cursor, having two points that are the best rectangles to both
288 of the cursors: (rx,ry)x(tx,ty) for start and (ux,uy)x(vx,vy) for end, we do a
289 1. (rx,ry)x(1,ty)
290 2. (0,ty)x(1,uy)
291 3. (0,uy)x(vx,vy)
292
293 To find the closest rectangle to cursor (cx,cy) we search for a rectangle that either contains the cursor
294 or that has a left border >= cx and bottom border >= cy.
295 */
297
298 PagePrivate *pagePrivate = PagePrivate::get(d->m_page);
299 const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
300 const double scaleX = d->m_page->width();
301 const double scaleY = d->m_page->height();
302
303 NormalizedPoint startC = sel->start();
304 NormalizedPoint endC = sel->end();
305 NormalizedPoint temp;
306
307 // if startPoint is right to endPoint swap them
308 if (startC.x > endC.x) {
309 temp = startC;
310 startC = endC;
311 endC = temp;
312 }
313
314 // minX,maxX,minY,maxY gives the bounding rectangle coordinates of the document
315 const NormalizedRect boundingRect = d->m_page->boundingBox();
316 const QRect content = boundingRect.geometry(scaleX, scaleY);
317 const double minX = content.left();
318 const double maxX = content.right();
319 const double minY = content.top();
320 const double maxY = content.bottom();
321
322 /**
323 * We will now find out the TinyTextEntity for the startRectangle and TinyTextEntity for
324 * the endRectangle. We have four cases:
325 *
326 * Case 1(a): both startpoint and endpoint are out of the bounding Rectangle and at one side, so the rectangle made of start
327 * and endPoint are outof the bounding rect (do not intersect)
328 *
329 * Case 1(b): both startpoint and endpoint are out of bounding rect, but they are in different side, so is their rectangle
330 *
331 * Case 2(a): find the rectangle which contains start and endpoint and having some
332 * TextEntity
333 *
334 * Case 2(b): if 2(a) fails (if startPoint and endPoint both are unchanged), then we check whether there is any
335 * TextEntity within the rect made by startPoint and endPoint
336 *
337 * Case 3: Now, we may have two type of selection.
338 * 1. startpoint is left-top of start_end and endpoint is right-bottom
339 * 2. startpoint is left-bottom of start_end and endpoint is top-right
340 *
341 * Also, as 2(b) is passed, we might have it,itEnd or both unchanged, but the fact is that we have
342 * text within them. so, we need to search for the best suitable textposition for start and end.
343 *
344 * Case 3(a): We search the nearest rectangle consisting of some
345 * TinyTextEntity right to or bottom of the startPoint for selection 01.
346 * And, for selection 02, we have to search for right and top
347 *
348 * Case 3(b): For endpoint, we have to find the point top of or left to
349 * endpoint if we have selection 01.
350 * Otherwise, the search will be left and bottom
351 */
352
353 // we know that startC.x > endC.x, we need to decide which is top and which is bottom
354 const NormalizedRect start_end = (startC.y < endC.y) ? NormalizedRect(startC.x, startC.y, endC.x, endC.y) : NormalizedRect(startC.x, endC.y, endC.x, startC.y);
355
356 // Case 1(a)
357 if (!boundingRect.intersects(start_end)) {
358 return ret;
359 } else {
360 // case 1(b)
361 /**
362 note that, after swapping of start and end, we know that,
363 start is always left to end. but, we cannot say start is
364 positioned upper than end.
365 **/
366 // if start is left to content rect take it to content rect boundary
367 if (startC.x * scaleX < minX) {
368 startC.x = minX / scaleX;
369 }
370 if (endC.x * scaleX > maxX) {
371 endC.x = maxX / scaleX;
372 }
373
374 // if start is top to end (selection type 01)
375 if (startC.y * scaleY < minY) {
376 startC.y = minY / scaleY;
377 }
378 if (endC.y * scaleY > maxY) {
379 endC.y = maxY / scaleY;
380 }
381
382 // if start is bottom to end (selection type 02)
383 if (startC.y * scaleY > maxY) {
384 startC.y = maxY / scaleY;
385 }
386 if (endC.y * scaleY < minY) {
387 endC.y = minY / scaleY;
388 }
389 }
390
391 TextEntity::List::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
392 TextEntity::List::ConstIterator start = it, end = itEnd, tmpIt = it; //, tmpItEnd = itEnd;
393 const MergeSide side = d->m_page ? (MergeSide)d->m_page->totalOrientation() : MergeRight;
394
395 NormalizedRect tmp;
396 // case 2(a)
397 for (; it != itEnd; ++it) {
398 tmp = it->area();
399 if (tmp.contains(startC.x, startC.y)) {
400 start = it;
401 }
402 if (tmp.contains(endC.x, endC.y)) {
403 end = it;
404 }
405 }
406
407 // case 2(b)
408 it = tmpIt;
409 if (start == it && end == itEnd) {
410 for (; it != itEnd; ++it) {
411 // is there any text rectangle within the start_end rect
412 tmp = it->area();
413 if (start_end.intersects(tmp)) {
414 break;
415 }
416 }
417
418 // we have searched every text entities, but none is within the rectangle created by start and end
419 // so, no selection should be done
420 if (it == itEnd) {
421 return ret;
422 }
423 }
424 it = tmpIt;
425 bool selection_two_start = false;
426
427 // case 3.a
428 if (start == it) {
429 NormalizedRect rect;
430
431 // selection type 01
432 if (startC.y <= endC.y) {
433 for (; it != itEnd; ++it) {
434 rect = it->area();
435 bool flagV = !rect.isBottom(startC);
436
437 if (flagV && rect.isRight(startC)) {
438 start = it;
439 break;
440 }
441 }
442 }
443
444 // selection type 02
445 else {
446 selection_two_start = true;
447 int distance = scaleX + scaleY + 100;
448
449 for (; it != itEnd; ++it) {
450 rect = it->area();
451
452 if (rect.isBottomOrLevel(startC) && rect.isRight(startC)) {
453 QRect entRect = rect.geometry(scaleX, scaleY);
454 int xdist, ydist;
455 xdist = entRect.center().x() - startC.x * scaleX;
456 ydist = entRect.center().y() - startC.y * scaleY;
457
458 // make them positive
459 if (xdist < 0) {
460 xdist = -xdist;
461 }
462 if (ydist < 0) {
463 ydist = -ydist;
464 }
465
466 if ((xdist + ydist) < distance) {
467 distance = xdist + ydist;
468 start = it;
469 }
470 }
471 }
472 }
473 }
474
475 // case 3.b
476 if (end == itEnd) {
477 it = tmpIt;
478 itEnd = itEnd - 1;
479
480 NormalizedRect rect;
481
482 if (startC.y <= endC.y) {
483 for (; itEnd >= it; itEnd--) {
484 rect = itEnd->area();
485 bool flagV = !rect.isTop(endC);
486
487 if (flagV && rect.isLeft(endC)) {
488 end = itEnd;
489 break;
490 }
491 }
492 }
493
494 else {
495 int distance = scaleX + scaleY + 100;
496 for (; itEnd >= it; itEnd--) {
497 rect = itEnd->area();
498
499 if (rect.isTopOrLevel(endC) && rect.isLeft(endC)) {
500 QRect entRect = rect.geometry(scaleX, scaleY);
501 int xdist, ydist;
502 xdist = entRect.center().x() - endC.x * scaleX;
503 ydist = entRect.center().y() - endC.y * scaleY;
504
505 // make them positive
506 if (xdist < 0) {
507 xdist = -xdist;
508 }
509 if (ydist < 0) {
510 ydist = -ydist;
511 }
512
513 if ((xdist + ydist) < distance) {
514 distance = xdist + ydist;
515 end = itEnd;
516 }
517 }
518 }
519 }
520 }
521
522 /* if start and end in selection 02 are in the same column, and we
523 start at an empty space we have to remove the selection of last
524 character
525 */
526 if (selection_two_start) {
527 if (start > end) {
528 start = start - 1;
529 }
530 }
531
532 // if start is less than end swap them
533 if (start > end) {
534 it = start;
535 start = end;
536 end = it;
537 }
538
539 // removes the possibility of crash, in case none of 1 to 3 is true
540 if (end == d->m_words.constEnd()) {
541 end--;
542 }
543
544 for (; start <= end; start++) {
545 ret->appendShape(start->transformedArea(matrix), side);
546 }
547
548 return ret;
549}
550
551RegularAreaRect *TextPage::findText(int searchID, const QString &query, SearchDirection direct, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
552{
553 SearchDirection dir = direct;
554 // invalid search request
555 if (d->m_words.isEmpty() || query.isEmpty() || (area && area->isNull())) {
556 return nullptr;
557 }
559 int start_offset = 0;
561 const QMap<int, SearchPoint *>::const_iterator sIt = d->m_searchPoints.constFind(searchID);
562 if (sIt == d->m_searchPoints.constEnd()) {
563 // if no previous run of this search is found, then set it to start
564 // from the beginning (respecting the search direction)
565 if (dir == NextResult) {
566 dir = FromTop;
567 } else if (dir == PreviousResult) {
568 dir = FromBottom;
569 }
570 }
571 bool forward = true;
572 switch (dir) {
573 case FromTop:
574 start = d->m_words.constBegin();
575 start_offset = 0;
576 end = d->m_words.constEnd();
577 break;
578 case FromBottom:
579 start = d->m_words.constEnd();
580 start_offset = 0;
581 end = d->m_words.constBegin();
582 forward = false;
583 break;
584 case NextResult:
585 start = (*sIt)->it_end;
586 start_offset = (*sIt)->offset_end;
587 end = d->m_words.constEnd();
588 break;
589 case PreviousResult:
590 start = (*sIt)->it_begin;
591 start_offset = (*sIt)->offset_begin;
592 end = d->m_words.constBegin();
593 forward = false;
594 break;
595 };
596 RegularAreaRect *ret = nullptr;
597 const TextComparisonFunction cmpFn = caseSensitivity == Qt::CaseSensitive ? CaseSensitiveCmpFn : CaseInsensitiveCmpFn;
598 if (forward) {
599 ret = d->findTextInternalForward(searchID, query, cmpFn, start, start_offset, end);
600 } else {
601 ret = d->findTextInternalBackward(searchID, query, cmpFn, start, start_offset, end);
602 }
603 return ret;
604}
605
606// hyphenated '-' must be at the end of a word, so hyphenation means
607// we have a '-' just followed by a '\n' character
608// check if the string contains a '-' character
609// if the '-' is the last entry
610static int stringLengthAdaptedWithHyphen(const QString &str, TextEntity::List::ConstIterator it, TextEntity::List::ConstIterator textListEnd)
611{
612 const int len = str.length();
613
614 // hyphenated '-' must be at the end of a word, so hyphenation means
615 // we have a '-' just followed by a '\n' character
616 // check if the string contains a '-' character
617 // if the '-' is the last entry
618 if (str.endsWith(QLatin1Char('-'))) {
619 // validity chek of it + 1
620 if ((it + 1) != textListEnd) {
621 // 1. if the next character is '\n'
622 const QString &lookahedStr = (it + 1)->text();
623 if (lookahedStr.startsWith(QLatin1Char('\n'))) {
624 return len - 1;
625 }
626
627 // 2. if the next word is in a different line or not
628 const NormalizedRect &hyphenArea = it->area();
629 const NormalizedRect &lookaheadArea = (it + 1)->area();
630
631 // lookahead to check whether both the '-' rect and next character rect overlap
632 if (!doesConsumeY(hyphenArea, lookaheadArea, 70)) {
633 return len - 1;
634 }
635 }
636 }
637 // else if it is the second last entry - for example in pdf format
638 else if (str.endsWith(QLatin1String("-\n"))) {
639 return len - 2;
640 }
641
642 return len;
643}
644
645RegularAreaRect *TextPagePrivate::searchPointToArea(const SearchPoint *sp)
646{
647 PagePrivate *pagePrivate = PagePrivate::get(m_page);
648 const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
650
651 for (TextEntity::List::ConstIterator it = sp->it_begin;; it++) {
652 const TextEntity &curEntity = *it;
653 ret->append(curEntity.transformedArea(matrix));
654
655 if (it == sp->it_end) {
656 break;
657 }
658 }
659
660 ret->simplify();
661 return ret;
662}
663
664RegularAreaRect *TextPagePrivate::findTextInternalForward(int searchID, const QString &_query, TextComparisonFunction comparer, TextEntity::List::ConstIterator start, int start_offset, TextEntity::List::ConstIterator end)
665{
666 // normalize query search all unicode (including glyphs)
668
669 // j is the current position in our query
670 // queryLeft is the length of the query we have left to match
671 int j = 0, queryLeft = query.length();
672
674 int offset = start_offset;
675
677 int offset_begin = 0; // dummy initial value to suppress compiler warnings
678
679 while (it != end) {
680 const TextEntity &curEntity = *it;
681 const QString &str = curEntity.text();
682 const int strLen = str.length();
683 const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
684 // adjustedLen <= strLen
685
686 if (offset >= strLen) {
687 it++;
688 offset = 0;
689 continue;
690 }
691
692 if (it_begin == TextEntity::List::ConstIterator()) {
693 it_begin = it;
694 offset_begin = offset;
695 }
696
697 // Let the user write the hyphen or not when searching for text
698 int matchedLen = -1;
699 for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
700 // we have equal (or less than) area of the query left as the length of the current
701 // entity
702 const int min = qMin(queryLeft, matchingLen - offset);
703 if (comparer(QStringView {str}.mid(offset, min), QStringView {query}.mid(j, min))) {
704 matchedLen = min;
705 break;
706 }
707 }
708
709 if (matchedLen == -1) {
710 // we have not matched
711 // this means we do not have a complete match
712 // we need to get back to query start
713 // and continue the search from this place
714#ifdef DEBUG_TEXTPAGE
715 qCDebug(OkularCoreDebug) << "\tnot matched";
716#endif
717 j = 0;
718 queryLeft = query.length();
719 it = it_begin;
720 offset = offset_begin + 1;
722 } else {
723 // we have a match
724 // move the current position in the query
725 // to the position after the length of this string
726 // we matched
727 // subtract the length of the current entity from
728 // the left length of the query
729#ifdef DEBUG_TEXTPAGE
730 qCDebug(OkularCoreDebug) << "\tmatched" << matchedLen;
731#endif
732 j += matchedLen;
733 queryLeft -= matchedLen;
734
735 if (queryLeft == 0) {
736 // save or update the search point for the current searchID
737 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
738 if (sIt == m_searchPoints.end()) {
739 sIt = m_searchPoints.insert(searchID, new SearchPoint);
740 }
741 SearchPoint *sp = *sIt;
742 sp->it_begin = it_begin;
743 sp->it_end = it;
744 sp->offset_begin = offset_begin;
745 sp->offset_end = offset + matchedLen;
746 return searchPointToArea(sp);
747 }
748
749 it++;
750 offset = 0;
751 }
752 }
753 // end of loop - it means that we've ended the textentities
754
755 const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
756 if (sIt != m_searchPoints.end()) {
757 SearchPoint *sp = *sIt;
758 m_searchPoints.erase(sIt);
759 delete sp;
760 }
761 return nullptr;
762}
763
764RegularAreaRect *TextPagePrivate::findTextInternalBackward(int searchID, const QString &_query, TextComparisonFunction comparer, TextEntity::List::ConstIterator start, int start_offset, TextEntity::List::ConstIterator end)
765{
766 // normalize query to search all unicode (including glyphs)
768
769 // j is the current position in our query
770 // len is the length of the string in TextEntity
771 // queryLeft is the length of the query we have left
772 int j = query.length(), queryLeft = query.length();
773
775 int offset = start_offset;
776
778 int offset_begin = 0; // dummy initial value to suppress compiler warnings
779
780 while (true) {
781 if (offset <= 0) {
782 if (it == end) {
783 break;
784 }
785 it--;
786 }
787
788 const TextEntity &curEntity = *it;
789 const QString &str = curEntity.text();
790 const int strLen = str.length();
791 const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
792 // adjustedLen <= strLen
793
794 if (offset <= 0) {
795 offset = strLen;
796 }
797
798 if (it_begin == TextEntity::List::ConstIterator()) {
799 it_begin = it;
800 offset_begin = offset;
801 }
802
803 // Let the user write the hyphen or not when searching for text
804 int matchedLen = -1;
805 // we have equal (or less than) area of the query left as the length of the current
806 // entity
807 for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
808 const int hyphenOffset = (strLen - matchingLen);
809 const int min = qMin(queryLeft + hyphenOffset, offset);
810 if (comparer(QStringView {str}.mid(offset - min, min - hyphenOffset), QStringView {query}.mid(j - min + hyphenOffset, min - hyphenOffset))) {
811 matchedLen = min - hyphenOffset;
812 break;
813 }
814 }
815
816 if (matchedLen == -1) {
817 // we have not matched
818 // this means we do not have a complete match
819 // we need to get back to query start
820 // and continue the search from this place
821#ifdef DEBUG_TEXTPAGE
822 qCDebug(OkularCoreDebug) << "\tnot matched";
823#endif
824
825 j = query.length();
826 queryLeft = query.length();
827 it = it_begin;
828 offset = offset_begin - 1;
830 } else {
831 // we have a match
832 // move the current position in the query
833 // to the position after the length of this string
834 // we matched
835 // subtract the length of the current entity from
836 // the left length of the query
837#ifdef DEBUG_TEXTPAGE
838 qCDebug(OkularCoreDebug) << "\tmatched";
839#endif
840 j -= matchedLen;
841 queryLeft -= matchedLen;
842
843 if (queryLeft == 0) {
844 // save or update the search point for the current searchID
845 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
846 if (sIt == m_searchPoints.end()) {
847 sIt = m_searchPoints.insert(searchID, new SearchPoint);
848 }
849 SearchPoint *sp = *sIt;
850 sp->it_begin = it;
851 sp->it_end = it_begin;
852 sp->offset_begin = offset - matchedLen;
853 sp->offset_end = offset_begin;
854 return searchPointToArea(sp);
855 }
856
857 offset = 0;
858 }
859 }
860 // end of loop - it means that we've ended the textentities
861
862 const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
863 if (sIt != m_searchPoints.end()) {
864 SearchPoint *sp = *sIt;
865 m_searchPoints.erase(sIt);
866 delete sp;
867 }
868 return nullptr;
869}
870
872{
874}
875
877{
878 if (area && area->isNull()) {
879 return QString();
880 }
881
882 TextEntity::List::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
883 QString ret;
884 if (area) {
885 for (; it != itEnd; ++it) {
887 if (area->intersects(it->area())) {
888 ret += it->text();
889 }
890 } else {
891 NormalizedPoint center = it->area().center();
892 if (area->contains(center.x, center.y)) {
893 ret += it->text();
894 }
895 }
896 }
897 } else {
898 for (; it != itEnd; ++it) {
899 ret += it->text();
900 }
901 }
902 return ret;
903}
904
905static bool compareTinyTextEntityX(const WordWithCharacters &first, const WordWithCharacters &second)
906{
907 QRect firstArea = first.area().roundedGeometry(1000, 1000);
908 QRect secondArea = second.area().roundedGeometry(1000, 1000);
909
910 return firstArea.left() < secondArea.left();
911}
912
913static bool compareTinyTextEntityY(const WordWithCharacters &first, const WordWithCharacters &second)
914{
915 const QRect firstArea = first.area().roundedGeometry(1000, 1000);
916 const QRect secondArea = second.area().roundedGeometry(1000, 1000);
917
918 return firstArea.top() < secondArea.top();
919}
920
921/**
922 * Sets a new world list. Deleting the contents of the old one
923 */
924void TextPagePrivate::setWordList(const TextEntity::List &list)
925{
926 m_words = list;
927}
928
929/**
930 * Remove all the spaces in between texts. It will make all the generators
931 * same, whether they save spaces(like pdf) or not(like djvu).
932 */
933static void removeSpace(TextEntity::List *words)
934{
935 TextEntity::List::Iterator it = words->begin();
936 const QString str(QLatin1Char(' '));
937
938 while (it != words->end()) {
939 if (it->text() == str) {
940 it = words->erase(it);
941 } else {
942 ++it;
943 }
944 }
945}
946
947/**
948 * We will read the TinyTextEntity from characters and try to create words from there.
949 * Note: characters might be already characters for some generators, but we will keep
950 * the nomenclature characters for the generator produced data. The resulting
951 * WordsWithCharacters memory has to be managed by the caller, both the
952 * WordWithCharacters::word and WordWithCharacters::characters contents
953 */
954static WordsWithCharacters makeWordFromCharacters(const TextEntity::List &characters, int pageWidth, int pageHeight)
955{
956 /**
957 * We will traverse characters and try to create words from the TinyTextEntities in it.
958 * We will search TinyTextEntity blocks and merge them until we get a
959 * space between two consecutive TinyTextEntities. When we get a space
960 * we can take it as a end of word. Then we store the word as a TinyTextEntity
961 * and keep it in newList.
962
963 * We create a RegionText named regionWord that contains the word and the characters associated with it and
964 * a rectangle area of the element in newList.
965
966 */
967 WordsWithCharacters wordsWithCharacters;
968
969 TextEntity::List::ConstIterator it = characters.begin(), itEnd = characters.end(), tmpIt;
970 int newLeft, newRight, newTop, newBottom;
971
972 for (; it != itEnd; it++) {
973 QString textString = it->text();
974 QString newString;
975 QRect lineArea = it->area().roundedGeometry(pageWidth, pageHeight);
976 QRect elementArea;
977 TextEntity::List wordCharacters;
978 tmpIt = it;
979 int space = 0;
980
981 while (!space) {
982 if (!textString.isEmpty()) {
983 newString.append(textString);
984
985 // when textString is the start of the word
986 if (tmpIt == it) {
987 NormalizedRect newRect(lineArea, pageWidth, pageHeight);
988 wordCharacters.append(TextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
989 } else {
990 NormalizedRect newRect(elementArea, pageWidth, pageHeight);
991 wordCharacters.append(TextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
992 }
993 }
994
995 ++it;
996
997 /*
998 we must have to put this line before the if condition of it==itEnd
999 otherwise the last character can be missed
1000 */
1001 if (it == itEnd) {
1002 break;
1003 }
1004 elementArea = it->area().roundedGeometry(pageWidth, pageHeight);
1005 if (!doesConsumeY(elementArea, lineArea, 60)) {
1006 --it;
1007 break;
1008 }
1009
1010 const int text_y1 = elementArea.top(), text_x1 = elementArea.left(), text_y2 = elementArea.y() + elementArea.height(), text_x2 = elementArea.x() + elementArea.width();
1011 const int line_y1 = lineArea.top(), line_x1 = lineArea.left(), line_y2 = lineArea.y() + lineArea.height(), line_x2 = lineArea.x() + lineArea.width();
1012
1013 space = elementArea.left() - lineArea.right();
1014
1015 if (space != 0) {
1016 it--;
1017 break;
1018 }
1019
1020 newLeft = text_x1 < line_x1 ? text_x1 : line_x1;
1021 newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1022 newTop = text_y1 > line_y1 ? line_y1 : text_y1;
1023 newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1024
1025 lineArea.setLeft(newLeft);
1026 lineArea.setTop(newTop);
1027 lineArea.setWidth(newRight - newLeft);
1028 lineArea.setHeight(newBottom - newTop);
1029
1030 textString = it->text();
1031 }
1032
1033 // if newString is not empty, save it
1034 if (!newString.isEmpty()) {
1035 const NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1037 wordsWithCharacters.append(WordWithCharacters(word, wordCharacters));
1038 }
1039
1040 if (it == itEnd) {
1041 break;
1042 }
1043 }
1044
1045 wordsWithCharacters.shrink_to_fit();
1046 return wordsWithCharacters;
1047}
1048
1049/**
1050 * Create Lines from the words and sort them
1051 */
1052QList<QPair<WordsWithCharacters, QRect>> makeAndSortLines(const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight)
1053{
1054 /**
1055 * We cannot assume that the generator will give us texts in the right order.
1056 * We can only assume that we will get texts in the page and their bounding
1057 * rectangle. The texts can be character, word, half-word anything.
1058 * So, we need to:
1059 **
1060 * 1. Sort rectangles/boxes containing texts by y0(top)
1061 * 2. Create textline where there is y overlap between TinyTextEntity 's
1062 * 3. Within each line sort the TinyTextEntity 's by x0(left)
1063 */
1064
1066
1067 /*
1068 Make a new copy of the TextList in the words, so that the wordsTmp and lines do
1069 not contain same pointers for all the TinyTextEntity.
1070 */
1071 QList<WordWithCharacters> words = wordsTmp;
1072
1073 // Step 1
1074 std::sort(words.begin(), words.end(), compareTinyTextEntityY);
1075
1076 // Step 2
1077 QList<WordWithCharacters>::Iterator it = words.begin(), itEnd = words.end();
1078
1079 // for every non-space texts(characters/words) in the textList
1080 for (; it != itEnd; it++) {
1081 const QRect elementArea = (*it).area().roundedGeometry(pageWidth, pageHeight);
1082 bool found = false;
1083
1084 for (QPair<WordsWithCharacters, QRect> &linesI : lines) {
1085 /* the line area which will be expanded
1086 line_rects is only necessary to preserve the topmin and bottommax of all
1087 the texts in the line, left and right is not necessary at all
1088 */
1089 QRect &lineArea = linesI.second;
1090 const int text_y1 = elementArea.top(), text_y2 = elementArea.top() + elementArea.height(), text_x1 = elementArea.left(), text_x2 = elementArea.left() + elementArea.width();
1091 const int line_y1 = lineArea.top(), line_y2 = lineArea.top() + lineArea.height(), line_x1 = lineArea.left(), line_x2 = lineArea.left() + lineArea.width();
1092
1093 /*
1094 if the new text and the line has y overlapping parts of more than 70%,
1095 the text will be added to this line
1096 */
1097 if (doesConsumeY(elementArea, lineArea, 70)) {
1098 WordsWithCharacters &line = linesI.first;
1099 line.append(*it);
1100
1101 const int newLeft = line_x1 < text_x1 ? line_x1 : text_x1;
1102 const int newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1103 const int newTop = line_y1 < text_y1 ? line_y1 : text_y1;
1104 const int newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1105
1106 lineArea = QRect(newLeft, newTop, newRight - newLeft, newBottom - newTop);
1107 found = true;
1108 }
1109
1110 if (found) {
1111 break;
1112 }
1113 }
1114
1115 /* when we have found a new line create a new TextList containing
1116 only one element and append it to the lines
1117 */
1118 if (!found) {
1119 lines.append(QPair<WordsWithCharacters, QRect>({*it}, elementArea));
1120 }
1121 }
1122
1123 // Step 3
1124 for (QPair<WordsWithCharacters, QRect> &line : lines) {
1126 std::sort(list.begin(), list.end(), compareTinyTextEntityX);
1127 }
1128 lines.shrink_to_fit();
1129
1130 return lines;
1131}
1132
1133/**
1134 * Calculate Statistical information from the lines we made previously
1135 */
1136static void calculateStatisticalInformation(const QList<WordWithCharacters> &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing)
1137{
1138 /**
1139 * For the region, defined by line_rects and lines
1140 * 1. Make line statistical analysis to find the line spacing
1141 * 2. Make character statistical analysis to differentiate between
1142 * word spacing and column spacing.
1143 */
1144
1145 /**
1146 * Step 0
1147 */
1148 const QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(words, pageWidth, pageHeight);
1149
1150 /**
1151 * Step 1
1152 */
1153 QMap<int, int> line_space_stat;
1154 for (int i = 0; i < sortedLines.length(); i++) {
1155 const QRect rectUpper = sortedLines.at(i).second;
1156
1157 if (i + 1 == sortedLines.length()) {
1158 break;
1159 }
1160 const QRect rectLower = sortedLines.at(i + 1).second;
1161
1162 int linespace = rectLower.top() - (rectUpper.top() + rectUpper.height());
1163 if (linespace < 0) {
1164 linespace = -linespace;
1165 }
1166
1167 if (line_space_stat.contains(linespace)) {
1168 line_space_stat[linespace]++;
1169 } else {
1170 line_space_stat[linespace] = 1;
1171 }
1172 }
1173
1174 *line_spacing = 0;
1175 int weighted_count = 0;
1176 QMapIterator<int, int> iterate_linespace(line_space_stat);
1177
1178 while (iterate_linespace.hasNext()) {
1179 iterate_linespace.next();
1180 *line_spacing += iterate_linespace.value() * iterate_linespace.key();
1181 weighted_count += iterate_linespace.value();
1182 }
1183 if (*line_spacing != 0) {
1184 *line_spacing = (int)((double)*line_spacing / (double)weighted_count + 0.5);
1185 }
1186
1187 /**
1188 * Step 2
1189 */
1190 // We would like to use QMap instead of QHash as it will keep the keys sorted
1191 QMap<int, int> hor_space_stat;
1192 QMap<int, int> col_space_stat;
1193 QList<QList<QRect>> space_rects;
1194 QVector<QRect> max_hor_space_rects;
1195
1196 // Space in every line
1197 for (const QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1198 const WordsWithCharacters list = sortedLine.first;
1199 QList<QRect> line_space_rects;
1200 int maxSpace = 0, minSpace = pageWidth;
1201
1202 // for every TinyTextEntity element in the line
1204 QRect max_area1, max_area2;
1205 // for every line
1206 for (; it != itEnd; it++) {
1207 const QRect area1 = (*it).area().roundedGeometry(pageWidth, pageHeight);
1208 if (it + 1 == itEnd) {
1209 break;
1210 }
1211
1212 const QRect area2 = (*(it + 1)).area().roundedGeometry(pageWidth, pageHeight);
1213 int space = area2.left() - area1.right();
1214
1215 if (space > maxSpace) {
1216 max_area1 = area1;
1217 max_area2 = area2;
1218 maxSpace = space;
1219 }
1220
1221 if (space < minSpace && space != 0) {
1222 minSpace = space;
1223 }
1224
1225 // if we found a real space, whose length is not zero and also less than the pageWidth
1226 if (space != 0 && space != pageWidth) {
1227 // increase the count of the space amount
1228 if (hor_space_stat.contains(space)) {
1229 hor_space_stat[space]++;
1230 } else {
1231 hor_space_stat[space] = 1;
1232 }
1233
1234 int left, right, top, bottom;
1235
1236 left = area1.right();
1237 right = area2.left();
1238
1239 top = area2.top() < area1.top() ? area2.top() : area1.top();
1240 bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1241
1242 QRect rect(left, top, right - left, bottom - top);
1243 line_space_rects.append(rect);
1244 }
1245 }
1246
1247 space_rects.append(line_space_rects);
1248
1249 if (hor_space_stat.contains(maxSpace)) {
1250 if (hor_space_stat[maxSpace] != 1) {
1251 hor_space_stat[maxSpace]--;
1252 } else {
1253 hor_space_stat.remove(maxSpace);
1254 }
1255 }
1256
1257 if (maxSpace != 0) {
1258 if (col_space_stat.contains(maxSpace)) {
1259 col_space_stat[maxSpace]++;
1260 } else {
1261 col_space_stat[maxSpace] = 1;
1262 }
1263
1264 // store the max rect of each line
1265 const int left = max_area1.right();
1266 const int right = max_area2.left();
1267 const int top = (max_area1.top() > max_area2.top()) ? max_area2.top() : max_area1.top();
1268 const int bottom = (max_area1.bottom() < max_area2.bottom()) ? max_area2.bottom() : max_area1.bottom();
1269
1270 const QRect rect(left, top, right - left, bottom - top);
1271 max_hor_space_rects.append(rect);
1272 } else {
1273 max_hor_space_rects.append(QRect(0, 0, 0, 0));
1274 }
1275 }
1276
1277 // All the between word space counts are in hor_space_stat
1278 *word_spacing = 0;
1279 weighted_count = 0;
1280 QMapIterator<int, int> iterate(hor_space_stat);
1281
1282 while (iterate.hasNext()) {
1283 iterate.next();
1284
1285 if (iterate.key() > 0) {
1286 *word_spacing += iterate.value() * iterate.key();
1287 weighted_count += iterate.value();
1288 }
1289 }
1290 if (weighted_count) {
1291 *word_spacing = (int)((double)*word_spacing / (double)weighted_count + 0.5);
1292 }
1293
1294 *col_spacing = 0;
1295 QMapIterator<int, int> iterate_col(col_space_stat);
1296
1297 while (iterate_col.hasNext()) {
1298 iterate_col.next();
1299 if (iterate_col.value() > *col_spacing) {
1300 *col_spacing = iterate_col.value();
1301 }
1302 }
1303 *col_spacing = col_space_stat.key(*col_spacing);
1304
1305 // if there is just one line in a region, there is no point in dividing it
1306 if (sortedLines.length() == 1) {
1307 *word_spacing = *col_spacing;
1308 }
1309}
1310
1311/**
1312 * Implements the XY Cut algorithm for textpage segmentation
1313 * The resulting RegionTextList will contain RegionText whose WordsWithCharacters::word and
1314 * WordsWithCharacters::characters are reused from wordsWithCharacters (i.e. no new nor delete happens in this function)
1315 */
1316static RegionTextList XYCutForBoundingBoxes(const QList<WordWithCharacters> &wordsWithCharacters, int pageWidth, int pageHeight)
1317{
1318 RegionTextList tree;
1319 QRect contentRect(0, 0, pageWidth, pageHeight);
1320 const RegionText root(wordsWithCharacters, contentRect);
1321
1322 // start the tree with the root, it is our only region at the start
1323 tree.push_back(root);
1324
1325 int i = 0;
1326
1327 // while traversing the tree has not been ended
1328 while (i < tree.length()) {
1329 const RegionText node = tree.at(i);
1330 QRect regionRect = node.area();
1331
1332 /**
1333 * 1. calculation of projection profiles
1334 */
1335 // allocate the size of proj profiles and initialize with 0
1336 int size_proj_y = node.area().height();
1337 int size_proj_x = node.area().width();
1338 // dynamic memory allocation
1339 QVarLengthArray<int> proj_on_xaxis(size_proj_x);
1340 QVarLengthArray<int> proj_on_yaxis(size_proj_y);
1341
1342 for (int j = 0; j < size_proj_y; ++j) {
1343 proj_on_yaxis[j] = 0;
1344 }
1345 for (int j = 0; j < size_proj_x; ++j) {
1346 proj_on_xaxis[j] = 0;
1347 }
1348
1349 const QList<WordWithCharacters> list = node.text();
1350
1351 // Calculate tcx and tcy locally for each new region
1352 int word_spacing, line_spacing, column_spacing;
1353 calculateStatisticalInformation(list, pageWidth, pageHeight, &word_spacing, &line_spacing, &column_spacing);
1354
1355 const int tcx = word_spacing * 2;
1356 const int tcy = line_spacing * 2;
1357
1358 int maxX = 0, maxY = 0;
1359 int avgX = 0;
1360 int count;
1361
1362 // for every text in the region
1363 for (const WordWithCharacters &wwc : list) {
1364 const QRect entRect = wwc.area().geometry(pageWidth, pageHeight);
1365
1366 // calculate vertical projection profile proj_on_xaxis1
1367 for (int k = entRect.left(); k <= entRect.left() + entRect.width(); ++k) {
1368 if ((k - regionRect.left()) < size_proj_x && (k - regionRect.left()) >= 0) {
1369 proj_on_xaxis[k - regionRect.left()] += entRect.height();
1370 }
1371 }
1372
1373 // calculate horizontal projection profile in the same way
1374 for (int k = entRect.top(); k <= entRect.top() + entRect.height(); ++k) {
1375 if ((k - regionRect.top()) < size_proj_y && (k - regionRect.top()) >= 0) {
1376 proj_on_yaxis[k - regionRect.top()] += entRect.width();
1377 }
1378 }
1379 }
1380
1381 for (int j = 0; j < size_proj_y; ++j) {
1382 if (proj_on_yaxis[j] > maxY) {
1383 maxY = proj_on_yaxis[j];
1384 }
1385 }
1386
1387 avgX = count = 0;
1388 for (int j = 0; j < size_proj_x; ++j) {
1389 if (proj_on_xaxis[j] > maxX) {
1390 maxX = proj_on_xaxis[j];
1391 }
1392 if (proj_on_xaxis[j]) {
1393 count++;
1394 avgX += proj_on_xaxis[j];
1395 }
1396 }
1397 if (count) {
1398 avgX /= count;
1399 }
1400
1401 /**
1402 * 2. Cleanup Boundary White Spaces and removal of noise
1403 */
1404 int xbegin = 0, xend = size_proj_x - 1;
1405 int ybegin = 0, yend = size_proj_y - 1;
1406 while (xbegin < size_proj_x && proj_on_xaxis[xbegin] <= 0) {
1407 xbegin++;
1408 }
1409 while (xend >= 0 && proj_on_xaxis[xend] <= 0) {
1410 xend--;
1411 }
1412 while (ybegin < size_proj_y && proj_on_yaxis[ybegin] <= 0) {
1413 ybegin++;
1414 }
1415 while (yend >= 0 && proj_on_yaxis[yend] <= 0) {
1416 yend--;
1417 }
1418
1419 // update the regionRect
1420 int old_left = regionRect.left(), old_top = regionRect.top();
1421 regionRect.setLeft(old_left + xbegin);
1422 regionRect.setRight(old_left + xend);
1423 regionRect.setTop(old_top + ybegin);
1424 regionRect.setBottom(old_top + yend);
1425
1426 int tnx = (int)((double)avgX * 10.0 / 100.0 + 0.5), tny = 0;
1427 for (int j = 0; j < size_proj_x; ++j) {
1428 proj_on_xaxis[j] -= tnx;
1429 }
1430 for (int j = 0; j < size_proj_y; ++j) {
1431 proj_on_yaxis[j] -= tny;
1432 }
1433
1434 /**
1435 * 3. Find the Widest gap
1436 */
1437 int gap_hor = -1, pos_hor = -1;
1438 int begin = -1, end = -1;
1439
1440 // find all hor_gaps and find the maximum between them
1441 for (int j = 1; j < size_proj_y; ++j) {
1442 // transition from white to black
1443 if (begin >= 0 && proj_on_yaxis[j - 1] <= 0 && proj_on_yaxis[j] > 0) {
1444 end = j;
1445 }
1446
1447 // transition from black to white
1448 if (proj_on_yaxis[j - 1] > 0 && proj_on_yaxis[j] <= 0) {
1449 begin = j;
1450 }
1451
1452 if (begin > 0 && end > 0 && end - begin > gap_hor) {
1453 gap_hor = end - begin;
1454 pos_hor = (end + begin) / 2;
1455 begin = -1;
1456 end = -1;
1457 }
1458 }
1459
1460 begin = -1, end = -1;
1461 int gap_ver = -1, pos_ver = -1;
1462
1463 // find all the ver_gaps and find the maximum between them
1464 for (int j = 1; j < size_proj_x; ++j) {
1465 // transition from white to black
1466 if (begin >= 0 && proj_on_xaxis[j - 1] <= 0 && proj_on_xaxis[j] > 0) {
1467 end = j;
1468 }
1469
1470 // transition from black to white
1471 if (proj_on_xaxis[j - 1] > 0 && proj_on_xaxis[j] <= 0) {
1472 begin = j;
1473 }
1474
1475 if (begin > 0 && end > 0 && end - begin > gap_ver) {
1476 gap_ver = end - begin;
1477 pos_ver = (end + begin) / 2;
1478 begin = -1;
1479 end = -1;
1480 }
1481 }
1482
1483 int cut_pos_x = pos_ver, cut_pos_y = pos_hor;
1484 int gap_x = gap_ver, gap_y = gap_hor;
1485
1486 /**
1487 * 4. Cut the region and make nodes (left,right) or (up,down)
1488 */
1489 bool cut_hor = false, cut_ver = false;
1490
1491 // For horizontal cut
1492 const int topHeight = cut_pos_y - (regionRect.top() - old_top);
1493 const QRect topRect(regionRect.left(), regionRect.top(), regionRect.width(), topHeight);
1494 const QRect bottomRect(regionRect.left(), regionRect.top() + topHeight, regionRect.width(), regionRect.height() - topHeight);
1495
1496 // For vertical Cut
1497 const int leftWidth = cut_pos_x - (regionRect.left() - old_left);
1498 const QRect leftRect(regionRect.left(), regionRect.top(), leftWidth, regionRect.height());
1499 const QRect rightRect(regionRect.left() + leftWidth, regionRect.top(), regionRect.width() - leftWidth, regionRect.height());
1500
1501 if (gap_y >= gap_x && gap_y >= tcy) {
1502 cut_hor = true;
1503 } else if (gap_y >= gap_x && gap_y <= tcy && gap_x >= tcx) {
1504 cut_ver = true;
1505 } else if (gap_x >= gap_y && gap_x >= tcx) {
1506 cut_ver = true;
1507 } else if (gap_x >= gap_y && gap_x <= tcx && gap_y >= tcy) {
1508 cut_hor = true;
1509 } else {
1510 // no cut possible
1511 // we can now update the node rectangle with the shrinked rectangle
1512 RegionText tmpNode = tree.at(i);
1513 tmpNode.setArea(regionRect);
1514 tree.replace(i, tmpNode);
1515 i++;
1516 continue;
1517 }
1518
1519 WordsWithCharacters list1, list2;
1520
1521 // horizontal cut, topRect and bottomRect
1522 if (cut_hor) {
1523 for (const WordWithCharacters &word : list) {
1524 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1525
1526 if (topRect.intersects(wordRect)) {
1527 list1.append(word);
1528 } else {
1529 list2.append(word);
1530 }
1531 }
1532
1533 RegionText node1(list1, topRect);
1534 RegionText node2(list2, bottomRect);
1535
1536 tree.replace(i, node1);
1537 tree.insert(i + 1, node2);
1538 }
1539
1540 // vertical cut, leftRect and rightRect
1541 else if (cut_ver) {
1542 for (const WordWithCharacters &word : list) {
1543 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1544
1545 if (leftRect.intersects(wordRect)) {
1546 list1.append(word);
1547 } else {
1548 list2.append(word);
1549 }
1550 }
1551
1552 RegionText node1(list1, leftRect);
1553 RegionText node2(list2, rightRect);
1554
1555 tree.replace(i, node1);
1556 tree.insert(i + 1, node2);
1557 }
1558 }
1559
1560 tree.shrink_to_fit();
1561 return tree;
1562}
1563
1564/**
1565 * Add spaces in between words in a line. It reuses the pointers passed in tree and might add new ones. You will need to take care of deleting them if needed
1566 */
1567TextEntity::List addNecessarySpace(RegionTextList tree, int pageWidth, int pageHeight)
1568{
1569 /**
1570 * 1. Call makeAndSortLines before adding spaces in between words in a line
1571 * 2. Now add spaces between every two words in a line
1572 * 3. Finally, extract all the space separated texts from each region and return it
1573 */
1574
1575 TextEntity::List res;
1576 // Only change the texts under RegionTexts, not the area
1577 for (const RegionText &tmpRegion : std::as_const(tree)) {
1578 // Step 01
1579 QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(tmpRegion.text(), pageWidth, pageHeight);
1580 int counter = 0;
1581
1582 // Step 02
1583 for (QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1584 WordsWithCharacters &list = sortedLine.first;
1585 for (int k = 0; k < list.length(); k++) {
1586 const QRect area1 = list.at(k).area().roundedGeometry(pageWidth, pageHeight);
1587 if (k + 1 >= list.length()) {
1588 break;
1589 }
1590
1591 const QRect area2 = list.at(k + 1).area().roundedGeometry(pageWidth, pageHeight);
1592 const int space = area2.left() - area1.right();
1593
1594 if (space != 0) {
1595 // Make a TinyTextEntity of string space and push it between it and it+1
1596 const int left = area1.right();
1597 const int right = area2.left();
1598 const int top = area2.top() < area1.top() ? area2.top() : area1.top();
1599 const int bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1600
1601 const QString spaceStr(QStringLiteral(" "));
1602 const QRect rect(QPoint(left, top), QPoint(right, bottom));
1603 const NormalizedRect entRect(rect, pageWidth, pageHeight);
1604 TextEntity ent1 = TextEntity(spaceStr, entRect);
1605 WordWithCharacters word(ent1, QList<TextEntity>() << ent1);
1606
1607 list.insert(k + 1, word);
1608
1609 // Skip the space
1610 k++;
1611 }
1612 }
1613 counter += list.length();
1614 }
1615 res.reserve(res.length() + counter);
1616
1617 for (const QPair<WordsWithCharacters, QRect> &sortedLine : std::as_const(sortedLines)) {
1618 for (const WordWithCharacters &word : sortedLine.first) {
1619 res += word.characters;
1620 }
1621 }
1622 }
1623
1624 // Step 03
1625 res.shrink_to_fit();
1626 return res;
1627}
1628
1629/**
1630 * Correct the textOrder, all layout recognition works here
1631 */
1632void TextPagePrivate::correctTextOrder()
1633{
1634 // m_page->width() and m_page->height() are in pixels at
1635 // 100% zoom level, and thus depend on display DPI.
1636 // To avoid Okular failing on lowDPI displays,
1637 // we scale pageWidth and pageHeight so their sum equals 2000.
1638 const double scalingFactor = 2000.0 / (m_page->width() + m_page->height());
1639 const int pageWidth = (int)(scalingFactor * m_page->width());
1640 const int pageHeight = (int)(scalingFactor * m_page->height());
1641
1642 TextEntity::List characters = m_words;
1643
1644 /**
1645 * Remove spaces from the text
1646 */
1647 removeSpace(&characters);
1648
1649 /**
1650 * Construct words from characters
1651 */
1652 const QList<WordWithCharacters> wordsWithCharacters = makeWordFromCharacters(characters, pageWidth, pageHeight);
1653
1654 /**
1655 * Make a XY Cut tree for segmentation of the texts
1656 */
1657 RegionTextList tree = XYCutForBoundingBoxes(wordsWithCharacters, pageWidth, pageHeight);
1658
1659 /**
1660 * Add spaces to the word
1661 */
1662 const auto listOfCharacters = addNecessarySpace(std::move(tree), pageWidth, pageHeight);
1663
1664 setWordList(listOfCharacters);
1665}
1666
1668{
1669 if (area && area->isNull()) {
1670 return TextEntity::List();
1671 }
1672
1673 TextEntity::List ret;
1674 if (area) {
1675 for (const TextEntity &te : std::as_const(d->m_words)) {
1677 if (area->intersects(te.area())) {
1678 ret.append(te);
1679 }
1680 } else {
1681 const NormalizedPoint center = te.area().center();
1682 if (area->contains(center.x, center.y)) {
1683 ret.append(te);
1684 }
1685 }
1686 }
1687 } else {
1688 for (const TextEntity &te : std::as_const(d->m_words)) {
1689 ret.append(te);
1690 }
1691 }
1692 return ret;
1693}
1694
1696{
1697 TextEntity::List::ConstIterator itBegin = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
1699 TextEntity::List::ConstIterator posIt = itEnd;
1700 for (; it != itEnd; ++it) {
1701 if (it->area().contains(p.x, p.y)) {
1702 posIt = it;
1703 break;
1704 }
1705 }
1706 if (posIt != itEnd) {
1707 if (posIt->text().simplified().isEmpty()) {
1708 return nullptr;
1709 }
1710 // Find the first TinyTextEntity of the word
1711 while (posIt != itBegin) {
1712 --posIt;
1713 const QString itText = posIt->text();
1714 if (itText.right(1).at(0).isSpace()) {
1715 if (itText.endsWith(QLatin1String("-\n"))) {
1716 // Is an hyphenated word
1717 // continue searching the start of the word back
1718 continue;
1719 }
1720
1721 if (itText == QLatin1String("\n") && posIt != itBegin) {
1722 --posIt;
1723 if (posIt->text().endsWith(QLatin1String("-"))) {
1724 // Is an hyphenated word
1725 // continue searching the start of the word back
1726 continue;
1727 }
1728 ++posIt;
1729 }
1730
1731 ++posIt;
1732 break;
1733 }
1734 }
1735 RegularAreaRect *ret = new RegularAreaRect();
1736 QString foundWord;
1737 for (; posIt != itEnd; ++posIt) {
1738 const QString itText = posIt->text();
1739 if (itText.simplified().isEmpty()) {
1740 break;
1741 }
1742
1743 ret->appendShape(posIt->area());
1744 foundWord += posIt->text();
1745 if (itText.right(1).at(0).isSpace()) {
1746 if (!foundWord.endsWith(QLatin1String("-\n"))) {
1747 break;
1748 }
1749 }
1750 }
1751
1752 if (word) {
1753 *word = foundWord;
1754 }
1755 return ret;
1756 } else {
1757 return nullptr;
1758 }
1759}
NormalizedPoint is a helper class which stores the coordinates of a normalized point.
Definition area.h:117
double x
The normalized x coordinate.
Definition area.h:166
double y
The normalized y coordinate.
Definition area.h:171
A NormalizedRect is a rectangle which can be defined by two NormalizedPoints.
Definition area.h:189
double bottom
The normalized bottom coordinate.
Definition area.h:435
bool isRight(const NormalizedPoint &pt) const
Returns true if the point pt is located to the left of the right edge of the rectangle.
Definition area.h:376
bool isLeft(const NormalizedPoint &pt) const
Returns true if the point pt is located to the right of the left edge of the rectangle.
Definition area.h:367
bool contains(double x, double y) const
Returns whether the normalized rectangle contains the normalized point (x, y).
Definition area.cpp:160
bool isTop(const NormalizedPoint &pt) const
Returns true if the point pt is located above the top of the rectangle.
Definition area.h:340
bool intersects(const NormalizedRect &other) const
Returns whether the normalized rectangle intersects the other normalized rectangle.
Definition area.cpp:165
bool isTopOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located above the bottom of the rectangle.
Definition area.h:358
bool isBottomOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located below the top of the rectangle.
Definition area.h:349
QRect geometry(int xScale, int yScale) const
Returns the rectangle mapped to a reference area of xScale x yScale.
Definition area.cpp:232
double top
The normalized top coordinate.
Definition area.h:425
bool isBottom(const NormalizedPoint &pt) const
Returns true if the point pt is located below the bottom of the rectangle.
Definition area.h:331
QRect roundedGeometry(int xScale, int yScale) const
Same functionality as geometry, but the output is now rounded before typecasting to int.
Definition area.cpp:239
void transform(const QTransform &matrix)
Transforms the normalized rectangle with the operations defined by matrix.
Definition area.cpp:253
This is a list of NormalizedRect, to describe an area consisting of multiple rectangles using normali...
Definition area.h:933
void appendShape(const NormalizedShape &shape, MergeSide side=MergeAll)
Appends the given shape to this area.
Definition area.h:810
bool isNull() const
Returns whether the regular area is a null area.
Definition area.h:751
void simplify()
Simplifies this regular area by merging its intersecting subareas.
Definition area.h:730
bool contains(double x, double y) const
Returns whether this area contains the normalized point (x, y).
Definition area.h:866
bool intersects(const RegularArea< NormalizedShape, Shape > *area) const
Returns whether this area intersects with the given area.
Definition area.h:783
Represents a piece of text on a TextPage, containing its textual representation and its bounding box.
Definition textpage.h:53
QString text() const
Returns the text of the text entity.
Definition textpage.cpp:142
TextEntity(const QString &text, const NormalizedRect &area)
Creates a new text entity with the given text and the given area.
Definition textpage.cpp:134
NormalizedRect area() const
Returns the bounding area of the text entity.
Definition textpage.cpp:147
~TextEntity()
Destroys the text entity.
NormalizedRect transformedArea(const QTransform &matrix) const
Returns the transformed area of the text entity.
Definition textpage.cpp:152
TextEntity::List words(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
Text entity extraction function.
TextAreaInclusionBehaviour
Defines the behaviour of adding characters to text() result.
Definition textpage.h:113
@ AnyPixelTextAreaInclusionBehaviour
A character is included into text() result if any pixel of his bounding box is in the given area.
Definition textpage.h:114
QString text(const RegularAreaRect *area=nullptr) const
Text extraction function.
Definition textpage.cpp:871
RegularAreaRect * findText(int searchID, const QString &query, SearchDirection direction, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
Returns the bounding rect of the text which matches the following criteria or 0 if the search is not ...
Definition textpage.cpp:551
void append(const QString &text, const NormalizedRect &area)
Appends the given text with the given area as new TextEntity to the page.
Definition textpage.cpp:185
~TextPage()
Destroys the text page.
Definition textpage.cpp:180
RegularAreaRect * textArea(TextSelection *selection) const
Returns the rectangular area of the given selection.
Definition textpage.cpp:276
RegularAreaRect * wordAt(const NormalizedPoint &p, QString *word=nullptr) const
Returns the area and text of the word at the given point Note that ownership of the returned area bel...
TextPage()
Creates a new text page.
Definition textpage.cpp:169
Wrapper around the information needed to generate the selection area There are two assumptions inside...
Definition misc.h:34
void end(const NormalizedPoint &point)
Changes the end point of the selection to the given point.
Definition misc.cpp:43
NormalizedPoint start() const
Returns the start point of the selection.
Definition misc.cpp:70
Q_SCRIPTABLE Q_NOREPLY void start()
KSERVICE_EXPORT KService::List query(FilterFunc filterFunc)
KIOCORE_EXPORT QStringList list(const QString &fileClass)
const QList< QKeySequence > & begin()
const QList< QKeySequence > & end()
global.h
Definition action.h:17
SearchDirection
Describes the direction of searching.
Definition global.h:36
@ FromTop
Searching from top of the page, next result is to be found, there was no earlier search result.
Definition global.h:37
@ FromBottom
Searching from bottom of the page, next result is to be found, there was no earlier search result.
Definition global.h:38
@ PreviousResult
Searching for the previous result on the page, earlier result should be located so we search from the...
Definition global.h:40
@ NextResult
Searching for the next result on the page, earlier result should be located so we search from the las...
Definition global.h:39
MergeSide
The side(s) to be considered when merging areas.
Definition global.h:64
@ MergeRight
Merge only if the right side of the first area intersect.
Definition global.h:65
bool isSpace() const const
typedef ConstIterator
typedef Iterator
void append(const T &value)
const T & at(int i) const const
QList::iterator begin()
QList::iterator end()
QList::iterator erase(QList::iterator pos)
T & first()
void insert(int i, const T &value)
bool isEmpty() const const
int length() const const
QList< T > mid(int pos, int length) const const
void reserve(int alloc)
bool contains(const Key &key) const const
const Key key(const T &value, const Key &defaultKey) const const
int remove(const Key &key)
int x() const const
int y() const const
int bottom() const const
QPoint center() const const
int height() const const
int left() const const
int right() const const
void setBottom(int y)
void setHeight(int height)
void setLeft(int x)
void setRight(int x)
void setTop(int y)
void setWidth(int width)
int top() const const
int width() const const
int x() const const
int y() const const
NormalizationForm_KC
QString & append(QChar ch)
const QChar at(int position) const const
bool endsWith(const QString &s, Qt::CaseSensitivity cs) const const
QString & insert(int position, QChar ch)
bool isEmpty() const const
int length() const const
QString mid(int position, int n) const const
QString normalized(QString::NormalizationForm mode, QChar::UnicodeVersion version) const const
QString right(int n) const const
QString simplified() const const
bool startsWith(const QString &s, Qt::CaseSensitivity cs) const const
int compare(QStringView str, Qt::CaseSensitivity cs) const const
CaseInsensitive
QTextStream & left(QTextStream &stream)
QTextStream & right(QTextStream &stream)
QTextStream & center(QTextStream &s)
void append(const T &value)
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Sun Feb 25 2024 18:44:05 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.