Okular

textpage.cpp
1/*
2 SPDX-FileCopyrightText: 2005 Piotr Szymanski <niedakh@gmail.com>
3
4 SPDX-License-Identifier: GPL-2.0-or-later
5*/
6
7#include "textpage.h"
8#include "textpage_p.h"
9
10#include <QDebug>
11
12#include "area.h"
13#include "debug_p.h"
14#include "misc.h"
15#include "page.h"
16#include "page_p.h"
17#include <unordered_set>
18
19#include <cstring>
20
21#include <QVarLengthArray>
22#include <QtAlgorithms>
23
24using namespace Okular;
25
26// Many of the strings are being reused; especially
27// those less than 2 letters are very common
28// Use the implicit shared bits from QString to
29// not keep multiple same strings around, but just up the
30// refcount a bit
31// The main reason for '2' is that most calls here happens
32// in auxillary threads that are following a document
33// and keeping the pool thread_local gives quite a bit
34// of advantage here.
35// Some calls though comes from the main thread, so we
36// shouldn't keep all the long strings allocated in the main
37// thread around forever.
38// '2' has been chosen by random testing, and guesswork.
39static QString fromPool(const QString &str)
40{
41 if (str.length() > 2) {
42 return str;
43 }
44
45 thread_local std::unordered_set<QString> pool;
46 auto [iterator, success] = pool.insert(str);
47 return *iterator;
48}
49
50class SearchPoint
51{
52public:
53 SearchPoint()
54 : offset_begin(-1)
55 , offset_end(-1)
56 {
57 }
58
59 /** The TextEntity containing the first character of the match. */
61
62 /** The TextEntity containing the last character of the match. */
64
65 /** The index of the first character of the match in (*it_begin)->text().
66 * Satisfies 0 <= offset_begin < (*it_begin)->text().length().
67 */
68 int offset_begin;
69
70 /** One plus the index of the last character of the match in (*it_end)->text().
71 * Satisfies 0 < offset_end <= (*it_end)->text().length().
72 */
73 int offset_end;
74};
75
76/* text comparison functions */
77
78static bool CaseInsensitiveCmpFn(QStringView from, QStringView to)
79{
80#ifdef DEBUG_TEXTPAGE
81 qDebug(OkularCoreDebug) << from << ":" << to << "(case insensitive)";
82#endif
83 return from.compare(to, Qt::CaseInsensitive) == 0;
84}
85
86static bool CaseSensitiveCmpFn(QStringView from, QStringView to)
87{
88#ifdef DEBUG_TEXTPAGE
89 qDebug(OkularCoreDebug) << from << ":" << to << "(case sensitive)";
90#endif
91 return from.compare(to, Qt::CaseSensitive) == 0;
92}
93
94/**
95 * Returns true iff segments [@p left1, @p right1] and [@p left2, @p right2] on the real line
96 * overlap within @p threshold percent, i. e. iff the ratio of the length of the
97 * intersection of the segments to the length of the shortest of the two input segments
98 * is not smaller than the threshold.
99 */
100static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold)
101{
102 // check if one consumes another fully (speed optimization)
103
104 if (left1 <= left2 && right1 >= right2) {
105 return true;
106 }
107
108 if (left1 >= left2 && right1 <= right2) {
109 return true;
110 }
111
112 // check if there is overlap above threshold
113 if (right2 >= left1 && right1 >= left2) {
114 double overlap = (right2 >= right1) ? right1 - left2 : right2 - left1;
115
116 double length1 = right1 - left1, length2 = right2 - left2;
117
118 return overlap * 100 >= threshold * qMin(length1, length2);
119 }
120
121 return false;
122}
123
124static bool doesConsumeY(const QRect first, const QRect second, int threshold)
125{
126 return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold);
127}
128
129static bool doesConsumeY(const NormalizedRect &first, const NormalizedRect &second, int threshold)
130{
131 return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold);
132}
133
135 : m_text(fromPool(text))
136 , m_area(area)
137{
138}
139
140TextEntity::~TextEntity() = default;
141
143{
144 return m_text;
145}
146
148{
149 return m_area;
150}
151
153{
154 NormalizedRect transformed_area = m_area;
155 transformed_area.transform(matrix);
156 return transformed_area;
157}
158
159TextPagePrivate::TextPagePrivate()
160 : m_page(nullptr)
161{
162}
163
164TextPagePrivate::~TextPagePrivate()
165{
166 qDeleteAll(m_searchPoints);
167}
168
170 : d(new TextPagePrivate())
171{
172}
173
175 : d(new TextPagePrivate())
176{
177 d->m_words = words;
178}
179
181{
182 delete d;
183}
184
185void TextPage::append(const QString &text, const NormalizedRect &area)
186{
187 if (!text.isEmpty()) {
188 if (!d->m_words.isEmpty()) {
189 TextEntity &lastEntity = d->m_words.last();
190 const QString concatText = lastEntity.text() + text.normalized(QString::NormalizationForm_KC);
191 if (concatText != concatText.normalized(QString::NormalizationForm_KC)) {
192 // If this happens it means that the new text + old one have combined, for example A and ◌̊ form Å
193 NormalizedRect newArea = area | lastEntity.area();
194 d->m_words.removeLast();
195 d->m_words.append(TextEntity(concatText.normalized(QString::NormalizationForm_KC), newArea));
196 return;
197 }
198 }
199
200 d->m_words.append(TextEntity(text.normalized(QString::NormalizationForm_KC), area));
201 }
202}
203
204struct WordWithCharacters {
205 WordWithCharacters(const TextEntity &w, const TextEntity::List &c)
206 : word(w)
207 , characters(c)
208 {
209 }
210
211 inline QString text() const
212 {
213 return word.text();
214 }
215
216 inline NormalizedRect area() const
217 {
218 return word.area();
219 }
220
221 TextEntity word;
222 TextEntity::List characters;
223};
225
226/**
227 * We will divide the whole page in some regions depending on the horizontal and
228 * vertical spacing among different regions. Each region will have an area and an
229 * associated WordsWithCharacters in sorted order.
230 */
231class RegionText
232{
233public:
234 RegionText() {};
235
236 RegionText(const WordsWithCharacters &wordsWithCharacters, const QRect area)
237 : m_region_wordWithCharacters(wordsWithCharacters)
238 , m_area(area)
239 {
240 }
241
242 inline QString string() const
243 {
244 QString res;
245 for (const WordWithCharacters &word : m_region_wordWithCharacters) {
246 res += word.text();
247 }
248 return res;
249 }
250
251 inline WordsWithCharacters text() const
252 {
253 return m_region_wordWithCharacters;
254 }
255
256 inline QRect area() const
257 {
258 return m_area;
259 }
260
261 inline void setArea(const QRect area)
262 {
263 m_area = area;
264 }
265
266 inline void setText(const WordsWithCharacters &wordsWithCharacters)
267 {
268 m_region_wordWithCharacters = wordsWithCharacters;
269 }
270
271private:
272 WordsWithCharacters m_region_wordWithCharacters;
273 QRect m_area;
274};
275
276std::unique_ptr<RegularAreaRect> TextPage::textArea(const TextSelection &sel) const
277{
278 if (d->m_words.isEmpty()) {
279 return std::make_unique<RegularAreaRect>();
280 }
281
282 /**
283 It works like this:
284 There are two cursors, we need to select all the text between them. The coordinates are normalised, leftTop is (0,0)
285 rightBottom is (1,1), so for cursors start (sx,sy) and end (ex,ey) we start with finding text rectangles under those
286 points, if not we search for the first that is to the right to it in the same baseline, if none found, then we search
287 for the first rectangle with a baseline under the cursor, having two points that are the best rectangles to both
288 of the cursors: (rx,ry)x(tx,ty) for start and (ux,uy)x(vx,vy) for end, we do a
289 1. (rx,ry)x(1,ty)
290 2. (0,ty)x(1,uy)
291 3. (0,uy)x(vx,vy)
292
293 To find the closest rectangle to cursor (cx,cy) we search for a rectangle that either contains the cursor
294 or that has a left border >= cx and bottom border >= cy.
295 */
296 auto ret = std::make_unique<RegularAreaRect>();
297
298 PagePrivate *pagePrivate = PagePrivate::get(d->m_page);
299 const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
300 const double scaleX = d->m_page->width();
301 const double scaleY = d->m_page->height();
302
303 NormalizedPoint startC = sel.start();
304 NormalizedPoint endC = sel.end();
305
306 // if startPoint is right to endPoint swap them
307 if (startC.x > endC.x) {
308 std::swap(startC, endC);
309 }
310
311 // minX,maxX,minY,maxY gives the bounding rectangle coordinates of the document
312 const NormalizedRect boundingRect = d->m_page->boundingBox();
313 const QRect content = boundingRect.geometry(scaleX, scaleY);
314 const double minX = content.left();
315 const double maxX = content.right();
316 const double minY = content.top();
317 const double maxY = content.bottom();
318
319 /**
320 * We will now find out the TinyTextEntity for the startRectangle and TinyTextEntity for
321 * the endRectangle. We have four cases:
322 *
323 * Case 1(a): both startpoint and endpoint are out of the bounding Rectangle and at one side, so the rectangle made of start
324 * and endPoint are outof the bounding rect (do not intersect)
325 *
326 * Case 1(b): both startpoint and endpoint are out of bounding rect, but they are in different side, so is their rectangle
327 *
328 * Case 2(a): find the rectangle which contains start and endpoint and having some
329 * TextEntity
330 *
331 * Case 2(b): if 2(a) fails (if startPoint and endPoint both are unchanged), then we check whether there is any
332 * TextEntity within the rect made by startPoint and endPoint
333 *
334 * Case 3: Now, we may have two type of selection.
335 * 1. startpoint is left-top of start_end and endpoint is right-bottom
336 * 2. startpoint is left-bottom of start_end and endpoint is top-right
337 *
338 * Also, as 2(b) is passed, we might have it,itEnd or both unchanged, but the fact is that we have
339 * text within them. so, we need to search for the best suitable textposition for start and end.
340 *
341 * Case 3(a): We search the nearest rectangle consisting of some
342 * TinyTextEntity right to or bottom of the startPoint for selection 01.
343 * And, for selection 02, we have to search for right and top
344 *
345 * Case 3(b): For endpoint, we have to find the point top of or left to
346 * endpoint if we have selection 01.
347 * Otherwise, the search will be left and bottom
348 */
349
350 // we know that startC.x > endC.x, we need to decide which is top and which is bottom
351 const NormalizedRect start_end = (startC.y < endC.y) ? NormalizedRect(startC.x, startC.y, endC.x, endC.y) : NormalizedRect(startC.x, endC.y, endC.x, startC.y);
352
353 // Case 1(a)
354 if (!boundingRect.intersects(start_end)) {
355 return ret;
356 } else {
357 // case 1(b)
358 /**
359 note that, after swapping of start and end, we know that,
360 start is always left to end. but, we cannot say start is
361 positioned upper than end.
362 **/
363 // if start is left to content rect take it to content rect boundary
364 if (startC.x * scaleX < minX) {
365 startC.x = minX / scaleX;
366 }
367 if (endC.x * scaleX > maxX) {
368 endC.x = maxX / scaleX;
369 }
370
371 // if start is top to end (selection type 01)
372 if (startC.y * scaleY < minY) {
373 startC.y = minY / scaleY;
374 }
375 if (endC.y * scaleY > maxY) {
376 endC.y = maxY / scaleY;
377 }
378
379 // if start is bottom to end (selection type 02)
380 if (startC.y * scaleY > maxY) {
381 startC.y = maxY / scaleY;
382 }
383 if (endC.y * scaleY < minY) {
384 endC.y = minY / scaleY;
385 }
386 }
387
388 TextEntity::List::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
389 TextEntity::List::ConstIterator start = it, end = itEnd, tmpIt = it; //, tmpItEnd = itEnd;
390 const MergeSide side = d->m_page ? (MergeSide)d->m_page->totalOrientation() : MergeRight;
391
392 NormalizedRect tmp;
393 // case 2(a)
394 for (; it != itEnd; ++it) {
395 tmp = it->area();
396 if (tmp.contains(startC.x, startC.y)) {
397 start = it;
398 }
399 if (tmp.contains(endC.x, endC.y)) {
400 end = it;
401 }
402 }
403
404 // case 2(b)
405 it = tmpIt;
406 if (start == it && end == itEnd) {
407 for (; it != itEnd; ++it) {
408 // is there any text rectangle within the start_end rect
409 tmp = it->area();
410 if (start_end.intersects(tmp)) {
411 break;
412 }
413 }
414
415 // we have searched every text entities, but none is within the rectangle created by start and end
416 // so, no selection should be done
417 if (it == itEnd) {
418 return ret;
419 }
420 }
421 it = tmpIt;
422 bool selection_two_start = false;
423
424 // case 3.a
425 if (start == it) {
426 NormalizedRect rect;
427
428 // selection type 01
429 if (startC.y <= endC.y) {
430 for (; it != itEnd; ++it) {
431 rect = it->area();
432 bool flagV = !rect.isBottom(startC);
433
434 if (flagV && rect.isRight(startC)) {
435 start = it;
436 break;
437 }
438 }
439 }
440
441 // selection type 02
442 else {
443 selection_two_start = true;
444 int distance = scaleX + scaleY + 100;
445
446 for (; it != itEnd; ++it) {
447 rect = it->area();
448
449 if (rect.isBottomOrLevel(startC) && rect.isRight(startC)) {
450 QRect entRect = rect.geometry(scaleX, scaleY);
451 int xdist, ydist;
452 xdist = entRect.center().x() - startC.x * scaleX;
453 ydist = entRect.center().y() - startC.y * scaleY;
454
455 // make them positive
456 if (xdist < 0) {
457 xdist = -xdist;
458 }
459 if (ydist < 0) {
460 ydist = -ydist;
461 }
462
463 if ((xdist + ydist) < distance) {
464 distance = xdist + ydist;
465 start = it;
466 }
467 }
468 }
469 }
470 }
471
472 // case 3.b
473 if (end == itEnd) {
474 it = tmpIt;
475 itEnd = itEnd - 1;
476
477 NormalizedRect rect;
478
479 if (startC.y <= endC.y) {
480 for (; itEnd >= it; itEnd--) {
481 rect = itEnd->area();
482 bool flagV = !rect.isTop(endC);
483
484 if (flagV && rect.isLeft(endC)) {
485 end = itEnd;
486 break;
487 }
488 }
489 }
490
491 else {
492 int distance = scaleX + scaleY + 100;
493 for (; itEnd >= it; itEnd--) {
494 rect = itEnd->area();
495
496 if (rect.isTopOrLevel(endC) && rect.isLeft(endC)) {
497 QRect entRect = rect.geometry(scaleX, scaleY);
498 int xdist, ydist;
499 xdist = entRect.center().x() - endC.x * scaleX;
500 ydist = entRect.center().y() - endC.y * scaleY;
501
502 // make them positive
503 if (xdist < 0) {
504 xdist = -xdist;
505 }
506 if (ydist < 0) {
507 ydist = -ydist;
508 }
509
510 if ((xdist + ydist) < distance) {
511 distance = xdist + ydist;
512 end = itEnd;
513 }
514 }
515 }
516 }
517 }
518
519 /* if start and end in selection 02 are in the same column, and we
520 start at an empty space we have to remove the selection of last
521 character
522 */
523 if (selection_two_start) {
524 if (start > end) {
525 start = start - 1;
526 }
527 }
528
529 // if start is less than end swap them
530 if (start > end) {
531 it = start;
532 start = end;
533 end = it;
534 }
535
536 // removes the possibility of crash, in case none of 1 to 3 is true
537 if (end == d->m_words.constEnd()) {
538 end--;
539 }
540
541 for (; start <= end; start++) {
542 ret->appendShape(start->transformedArea(matrix), side);
543 }
544
545 return ret;
546}
547
548RegularAreaRect *TextPage::findText(int searchID, const QString &query, SearchDirection direct, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
549{
550 SearchDirection dir = direct;
551 // invalid search request
552 if (d->m_words.isEmpty() || query.isEmpty() || (area && area->isNull())) {
553 return nullptr;
554 }
556 int start_offset = 0;
558 const QMap<int, SearchPoint *>::const_iterator sIt = d->m_searchPoints.constFind(searchID);
559 if (sIt == d->m_searchPoints.constEnd()) {
560 // if no previous run of this search is found, then set it to start
561 // from the beginning (respecting the search direction)
562 if (dir == NextResult) {
563 dir = FromTop;
564 } else if (dir == PreviousResult) {
565 dir = FromBottom;
566 }
567 }
568 bool forward = true;
569 switch (dir) {
570 case FromTop:
571 start = d->m_words.constBegin();
572 start_offset = 0;
573 end = d->m_words.constEnd();
574 break;
575 case FromBottom:
576 start = d->m_words.constEnd();
577 start_offset = 0;
578 end = d->m_words.constBegin();
579 forward = false;
580 break;
581 case NextResult:
582 start = (*sIt)->it_end;
583 start_offset = (*sIt)->offset_end;
584 end = d->m_words.constEnd();
585 break;
586 case PreviousResult:
587 start = (*sIt)->it_begin;
588 start_offset = (*sIt)->offset_begin;
589 end = d->m_words.constBegin();
590 forward = false;
591 break;
592 };
593 RegularAreaRect *ret = nullptr;
594 const TextComparisonFunction cmpFn = caseSensitivity == Qt::CaseSensitive ? CaseSensitiveCmpFn : CaseInsensitiveCmpFn;
595 if (forward) {
596 ret = d->findTextInternalForward(searchID, query, cmpFn, start, start_offset, end);
597 } else {
598 ret = d->findTextInternalBackward(searchID, query, cmpFn, start, start_offset, end);
599 }
600 return ret;
601}
602
603// hyphenated '-' must be at the end of a word, so hyphenation means
604// we have a '-' just followed by a '\n' character
605// check if the string contains a '-' character
606// if the '-' is the last entry
607static int stringLengthAdaptedWithHyphen(const QString &str, TextEntity::List::ConstIterator it, TextEntity::List::ConstIterator textListEnd)
608{
609 const int len = str.length();
610
611 // hyphenated '-' must be at the end of a word, so hyphenation means
612 // we have a '-' just followed by a '\n' character
613 // check if the string contains a '-' character
614 // if the '-' is the last entry
615 if (str.endsWith(QLatin1Char('-'))) {
616 // validity chek of it + 1
617 if ((it + 1) != textListEnd) {
618 // 1. if the next character is '\n'
619 const QString &lookahedStr = (it + 1)->text();
620 if (lookahedStr.startsWith(QLatin1Char('\n'))) {
621 return len - 1;
622 }
623
624 // 2. if the next word is in a different line or not
625 const NormalizedRect &hyphenArea = it->area();
626 const NormalizedRect &lookaheadArea = (it + 1)->area();
627
628 // lookahead to check whether both the '-' rect and next character rect overlap
629 if (!doesConsumeY(hyphenArea, lookaheadArea, 70)) {
630 return len - 1;
631 }
632 }
633 }
634 // else if it is the second last entry - for example in pdf format
635 else if (str.endsWith(QLatin1String("-\n"))) {
636 return len - 2;
637 }
638
639 return len;
640}
641
642RegularAreaRect *TextPagePrivate::searchPointToArea(const SearchPoint *sp)
643{
644 PagePrivate *pagePrivate = PagePrivate::get(m_page);
645 const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
647
648 for (TextEntity::List::ConstIterator it = sp->it_begin;; it++) {
649 const TextEntity &curEntity = *it;
650 ret->append(curEntity.transformedArea(matrix));
651
652 if (it == sp->it_end) {
653 break;
654 }
655 }
656
657 ret->simplify();
658 return ret;
659}
660
661RegularAreaRect *TextPagePrivate::findTextInternalForward(int searchID, const QString &_query, TextComparisonFunction comparer, TextEntity::List::ConstIterator start, int start_offset, TextEntity::List::ConstIterator end)
662{
663 // normalize query search all unicode (including glyphs)
665
666 // j is the current position in our query
667 // queryLeft is the length of the query we have left to match
668 int j = 0, queryLeft = query.length();
669
671 int offset = start_offset;
672
674 int offset_begin = 0; // dummy initial value to suppress compiler warnings
675
676 while (it != end) {
677 const TextEntity &curEntity = *it;
678 const QString &str = curEntity.text();
679 const int strLen = str.length();
680 const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
681 // adjustedLen <= strLen
682
683 if (offset >= strLen) {
684 it++;
685 offset = 0;
686 continue;
687 }
688
689 if (it_begin == TextEntity::List::ConstIterator()) {
690 it_begin = it;
691 offset_begin = offset;
692 }
693
694 // Let the user write the hyphen or not when searching for text
695 int matchedLen = -1;
696 for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
697 // we have equal (or less than) area of the query left as the length of the current
698 // entity
699 const int min = qMin(queryLeft, matchingLen - offset);
700 if (comparer(QStringView {str}.mid(offset, min), QStringView {query}.mid(j, min))) {
701 matchedLen = min;
702 break;
703 }
704 }
705
706 if (matchedLen == -1) {
707 // we have not matched
708 // this means we do not have a complete match
709 // we need to get back to query start
710 // and continue the search from this place
711#ifdef DEBUG_TEXTPAGE
712 qCDebug(OkularCoreDebug) << "\tnot matched";
713#endif
714 j = 0;
715 queryLeft = query.length();
716 it = it_begin;
717 offset = offset_begin + 1;
719 } else {
720 // we have a match
721 // move the current position in the query
722 // to the position after the length of this string
723 // we matched
724 // subtract the length of the current entity from
725 // the left length of the query
726#ifdef DEBUG_TEXTPAGE
727 qCDebug(OkularCoreDebug) << "\tmatched" << matchedLen;
728#endif
729 j += matchedLen;
730 queryLeft -= matchedLen;
731
732 if (queryLeft == 0) {
733 // save or update the search point for the current searchID
734 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
735 if (sIt == m_searchPoints.end()) {
736 sIt = m_searchPoints.insert(searchID, new SearchPoint);
737 }
738 SearchPoint *sp = *sIt;
739 sp->it_begin = it_begin;
740 sp->it_end = it;
741 sp->offset_begin = offset_begin;
742 sp->offset_end = offset + matchedLen;
743 return searchPointToArea(sp);
744 }
745
746 it++;
747 offset = 0;
748 }
749 }
750 // end of loop - it means that we've ended the textentities
751
752 const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
753 if (sIt != m_searchPoints.end()) {
754 SearchPoint *sp = *sIt;
755 m_searchPoints.erase(sIt);
756 delete sp;
757 }
758 return nullptr;
759}
760
761RegularAreaRect *TextPagePrivate::findTextInternalBackward(int searchID, const QString &_query, TextComparisonFunction comparer, TextEntity::List::ConstIterator start, int start_offset, TextEntity::List::ConstIterator end)
762{
763 // normalize query to search all unicode (including glyphs)
765
766 // j is the current position in our query
767 // len is the length of the string in TextEntity
768 // queryLeft is the length of the query we have left
769 int j = query.length(), queryLeft = query.length();
770
772 int offset = start_offset;
773
775 int offset_begin = 0; // dummy initial value to suppress compiler warnings
776
777 while (true) {
778 if (offset <= 0) {
779 if (it == end) {
780 break;
781 }
782 it--;
783 }
784
785 const TextEntity &curEntity = *it;
786 const QString &str = curEntity.text();
787 const int strLen = str.length();
788 const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
789 // adjustedLen <= strLen
790
791 if (offset <= 0) {
792 offset = strLen;
793 }
794
795 if (it_begin == TextEntity::List::ConstIterator()) {
796 it_begin = it;
797 offset_begin = offset;
798 }
799
800 // Let the user write the hyphen or not when searching for text
801 int matchedLen = -1;
802 // we have equal (or less than) area of the query left as the length of the current
803 // entity
804 for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
805 const int hyphenOffset = (strLen - matchingLen);
806 const int min = qMin(queryLeft + hyphenOffset, offset);
807 if (comparer(QStringView {str}.mid(offset - min, min - hyphenOffset), QStringView {query}.mid(j - min + hyphenOffset, min - hyphenOffset))) {
808 matchedLen = min - hyphenOffset;
809 break;
810 }
811 }
812
813 if (matchedLen == -1) {
814 // we have not matched
815 // this means we do not have a complete match
816 // we need to get back to query start
817 // and continue the search from this place
818#ifdef DEBUG_TEXTPAGE
819 qCDebug(OkularCoreDebug) << "\tnot matched";
820#endif
821
822 j = query.length();
823 queryLeft = query.length();
824 it = it_begin;
825 offset = offset_begin - 1;
827 } else {
828 // we have a match
829 // move the current position in the query
830 // to the position after the length of this string
831 // we matched
832 // subtract the length of the current entity from
833 // the left length of the query
834#ifdef DEBUG_TEXTPAGE
835 qCDebug(OkularCoreDebug) << "\tmatched";
836#endif
837 j -= matchedLen;
838 queryLeft -= matchedLen;
839
840 if (queryLeft == 0) {
841 // save or update the search point for the current searchID
842 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
843 if (sIt == m_searchPoints.end()) {
844 sIt = m_searchPoints.insert(searchID, new SearchPoint);
845 }
846 SearchPoint *sp = *sIt;
847 sp->it_begin = it;
848 sp->it_end = it_begin;
849 sp->offset_begin = offset - matchedLen;
850 sp->offset_end = offset_begin;
851 return searchPointToArea(sp);
852 }
853
854 offset = 0;
855 }
856 }
857 // end of loop - it means that we've ended the textentities
858
859 const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
860 if (sIt != m_searchPoints.end()) {
861 SearchPoint *sp = *sIt;
862 m_searchPoints.erase(sIt);
863 delete sp;
864 }
865 return nullptr;
866}
867
869{
871}
872
874{
875 if (area && area->isNull()) {
876 return QString();
877 }
878
879 TextEntity::List::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
880 QString ret;
881 if (area) {
882 for (; it != itEnd; ++it) {
884 if (area->intersects(it->area())) {
885 ret += it->text();
886 }
887 } else {
888 NormalizedPoint center = it->area().center();
889 if (area->contains(center.x, center.y)) {
890 ret += it->text();
891 }
892 }
893 }
894 } else {
895 for (; it != itEnd; ++it) {
896 ret += it->text();
897 }
898 }
899 return ret;
900}
901
902static bool compareTinyTextEntityX(const WordWithCharacters &first, const WordWithCharacters &second)
903{
904 QRect firstArea = first.area().roundedGeometry(1000, 1000);
905 QRect secondArea = second.area().roundedGeometry(1000, 1000);
906
907 return firstArea.left() < secondArea.left();
908}
909
910static bool compareTinyTextEntityY(const WordWithCharacters &first, const WordWithCharacters &second)
911{
912 const QRect firstArea = first.area().roundedGeometry(1000, 1000);
913 const QRect secondArea = second.area().roundedGeometry(1000, 1000);
914
915 return firstArea.top() < secondArea.top();
916}
917
918/**
919 * Sets a new world list. Deleting the contents of the old one
920 */
921void TextPagePrivate::setWordList(const TextEntity::List &list)
922{
923 m_words = list;
924}
925
926/**
927 * Remove all the spaces in between texts. It will make all the generators
928 * same, whether they save spaces(like pdf) or not(like djvu).
929 */
930static void removeSpace(TextEntity::List *words)
931{
932 TextEntity::List::Iterator it = words->begin();
933 const QString str(QLatin1Char(' '));
934
935 while (it != words->end()) {
936 if (it->text() == str) {
937 it = words->erase(it);
938 } else {
939 ++it;
940 }
941 }
942}
943
944/**
945 * We will read the TinyTextEntity from characters and try to create words from there.
946 * Note: characters might be already characters for some generators, but we will keep
947 * the nomenclature characters for the generator produced data. The resulting
948 * WordsWithCharacters memory has to be managed by the caller, both the
949 * WordWithCharacters::word and WordWithCharacters::characters contents
950 */
951static WordsWithCharacters makeWordFromCharacters(const TextEntity::List &characters, int pageWidth, int pageHeight)
952{
953 /**
954 * We will traverse characters and try to create words from the TinyTextEntities in it.
955 * We will search TinyTextEntity blocks and merge them until we get a
956 * space between two consecutive TinyTextEntities. When we get a space
957 * we can take it as a end of word. Then we store the word as a TinyTextEntity
958 * and keep it in newList.
959
960 * We create a RegionText named regionWord that contains the word and the characters associated with it and
961 * a rectangle area of the element in newList.
962
963 */
964 WordsWithCharacters wordsWithCharacters;
965
966 TextEntity::List::ConstIterator it = characters.begin(), itEnd = characters.end(), tmpIt;
967 int newLeft, newRight, newTop, newBottom;
968
969 for (; it != itEnd; it++) {
970 QString textString = it->text();
971 QString newString;
972 QRect lineArea = it->area().roundedGeometry(pageWidth, pageHeight);
973 QRect elementArea;
974 TextEntity::List wordCharacters;
975 tmpIt = it;
976 int space = 0;
977
978 while (!space) {
979 if (!textString.isEmpty()) {
980 newString.append(textString);
981
982 // when textString is the start of the word
983 if (tmpIt == it) {
984 NormalizedRect newRect(lineArea, pageWidth, pageHeight);
985 wordCharacters.append(TextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
986 } else {
987 NormalizedRect newRect(elementArea, pageWidth, pageHeight);
988 wordCharacters.append(TextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
989 }
990 }
991
992 ++it;
993
994 /*
995 we must have to put this line before the if condition of it==itEnd
996 otherwise the last character can be missed
997 */
998 if (it == itEnd) {
999 break;
1000 }
1001 elementArea = it->area().roundedGeometry(pageWidth, pageHeight);
1002 if (!doesConsumeY(elementArea, lineArea, 60)) {
1003 --it;
1004 break;
1005 }
1006
1007 const int text_y1 = elementArea.top(), text_x1 = elementArea.left(), text_y2 = elementArea.y() + elementArea.height(), text_x2 = elementArea.x() + elementArea.width();
1008 const int line_y1 = lineArea.top(), line_x1 = lineArea.left(), line_y2 = lineArea.y() + lineArea.height(), line_x2 = lineArea.x() + lineArea.width();
1009
1010 space = elementArea.left() - lineArea.right();
1011
1012 if (space != 0) {
1013 it--;
1014 break;
1015 }
1016
1017 newLeft = text_x1 < line_x1 ? text_x1 : line_x1;
1018 newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1019 newTop = text_y1 > line_y1 ? line_y1 : text_y1;
1020 newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1021
1022 lineArea.setLeft(newLeft);
1023 lineArea.setTop(newTop);
1024 lineArea.setWidth(newRight - newLeft);
1025 lineArea.setHeight(newBottom - newTop);
1026
1027 textString = it->text();
1028 }
1029
1030 // if newString is not empty, save it
1031 if (!newString.isEmpty()) {
1032 const NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1034 wordsWithCharacters.append(WordWithCharacters(word, wordCharacters));
1035 }
1036
1037 if (it == itEnd) {
1038 break;
1039 }
1040 }
1041
1042 wordsWithCharacters.shrink_to_fit();
1043 return wordsWithCharacters;
1044}
1045
1046/**
1047 * Create Lines from the words and sort them
1048 */
1049QList<QPair<WordsWithCharacters, QRect>> makeAndSortLines(const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight)
1050{
1051 /**
1052 * We cannot assume that the generator will give us texts in the right order.
1053 * We can only assume that we will get texts in the page and their bounding
1054 * rectangle. The texts can be character, word, half-word anything.
1055 * So, we need to:
1056 **
1057 * 1. Sort rectangles/boxes containing texts by y0(top)
1058 * 2. Create textline where there is y overlap between TinyTextEntity 's
1059 * 3. Within each line sort the TinyTextEntity 's by x0(left)
1060 */
1061
1063
1064 /*
1065 Make a new copy of the TextList in the words, so that the wordsTmp and lines do
1066 not contain same pointers for all the TinyTextEntity.
1067 */
1068 QList<WordWithCharacters> words = wordsTmp;
1069
1070 // Step 1
1071 std::sort(words.begin(), words.end(), compareTinyTextEntityY);
1072
1073 // Step 2
1074 QList<WordWithCharacters>::Iterator it = words.begin(), itEnd = words.end();
1075
1076 // for every non-space texts(characters/words) in the textList
1077 for (; it != itEnd; it++) {
1078 const QRect elementArea = (*it).area().roundedGeometry(pageWidth, pageHeight);
1079 bool found = false;
1080
1081 for (QPair<WordsWithCharacters, QRect> &linesI : lines) {
1082 /* the line area which will be expanded
1083 line_rects is only necessary to preserve the topmin and bottommax of all
1084 the texts in the line, left and right is not necessary at all
1085 */
1086 QRect &lineArea = linesI.second;
1087 const int text_y1 = elementArea.top(), text_y2 = elementArea.top() + elementArea.height(), text_x1 = elementArea.left(), text_x2 = elementArea.left() + elementArea.width();
1088 const int line_y1 = lineArea.top(), line_y2 = lineArea.top() + lineArea.height(), line_x1 = lineArea.left(), line_x2 = lineArea.left() + lineArea.width();
1089
1090 /*
1091 if the new text and the line has y overlapping parts of more than 70%,
1092 the text will be added to this line
1093 */
1094 if (doesConsumeY(elementArea, lineArea, 70)) {
1095 WordsWithCharacters &line = linesI.first;
1096 line.append(*it);
1097
1098 const int newLeft = line_x1 < text_x1 ? line_x1 : text_x1;
1099 const int newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1100 const int newTop = line_y1 < text_y1 ? line_y1 : text_y1;
1101 const int newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1102
1103 lineArea = QRect(newLeft, newTop, newRight - newLeft, newBottom - newTop);
1104 found = true;
1105 }
1106
1107 if (found) {
1108 break;
1109 }
1110 }
1111
1112 /* when we have found a new line create a new TextList containing
1113 only one element and append it to the lines
1114 */
1115 if (!found) {
1116 lines.append(QPair<WordsWithCharacters, QRect>({*it}, elementArea));
1117 }
1118 }
1119
1120 // Step 3
1121 for (QPair<WordsWithCharacters, QRect> &line : lines) {
1123 std::sort(list.begin(), list.end(), compareTinyTextEntityX);
1124 }
1125 lines.shrink_to_fit();
1126
1127 return lines;
1128}
1129
1130/**
1131 * Calculate Statistical information from the lines we made previously
1132 */
1133static void calculateStatisticalInformation(const QList<WordWithCharacters> &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing)
1134{
1135 /**
1136 * For the region, defined by line_rects and lines
1137 * 1. Make line statistical analysis to find the line spacing
1138 * 2. Make character statistical analysis to differentiate between
1139 * word spacing and column spacing.
1140 */
1141
1142 /**
1143 * Step 0
1144 */
1145 const QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(words, pageWidth, pageHeight);
1146
1147 /**
1148 * Step 1
1149 */
1150 QMap<int, int> line_space_stat;
1151 for (int i = 0; i < sortedLines.length(); i++) {
1152 const QRect rectUpper = sortedLines.at(i).second;
1153
1154 if (i + 1 == sortedLines.length()) {
1155 break;
1156 }
1157 const QRect rectLower = sortedLines.at(i + 1).second;
1158
1159 int linespace = rectLower.top() - (rectUpper.top() + rectUpper.height());
1160 if (linespace < 0) {
1161 linespace = -linespace;
1162 }
1163
1164 if (line_space_stat.contains(linespace)) {
1165 line_space_stat[linespace]++;
1166 } else {
1167 line_space_stat[linespace] = 1;
1168 }
1169 }
1170
1171 *line_spacing = 0;
1172 int weighted_count = 0;
1173 QMapIterator<int, int> iterate_linespace(line_space_stat);
1174
1175 while (iterate_linespace.hasNext()) {
1176 iterate_linespace.next();
1177 *line_spacing += iterate_linespace.value() * iterate_linespace.key();
1178 weighted_count += iterate_linespace.value();
1179 }
1180 if (*line_spacing != 0) {
1181 *line_spacing = (int)((double)*line_spacing / (double)weighted_count + 0.5);
1182 }
1183
1184 /**
1185 * Step 2
1186 */
1187 // We would like to use QMap instead of QHash as it will keep the keys sorted
1188 QMap<int, int> hor_space_stat;
1189 QMap<int, int> col_space_stat;
1190 QList<QList<QRect>> space_rects;
1191 QVector<QRect> max_hor_space_rects;
1192
1193 // Space in every line
1194 for (const QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1195 const WordsWithCharacters list = sortedLine.first;
1196 QList<QRect> line_space_rects;
1197 int maxSpace = 0, minSpace = pageWidth;
1198
1199 // for every TinyTextEntity element in the line
1201 QRect max_area1, max_area2;
1202 // for every line
1203 for (; it != itEnd; it++) {
1204 const QRect area1 = (*it).area().roundedGeometry(pageWidth, pageHeight);
1205 if (it + 1 == itEnd) {
1206 break;
1207 }
1208
1209 const QRect area2 = (*(it + 1)).area().roundedGeometry(pageWidth, pageHeight);
1210 int space = area2.left() - area1.right();
1211
1212 if (space > maxSpace) {
1213 max_area1 = area1;
1214 max_area2 = area2;
1215 maxSpace = space;
1216 }
1217
1218 if (space < minSpace && space != 0) {
1219 minSpace = space;
1220 }
1221
1222 // if we found a real space, whose length is not zero and also less than the pageWidth
1223 if (space != 0 && space != pageWidth) {
1224 // increase the count of the space amount
1225 if (hor_space_stat.contains(space)) {
1226 hor_space_stat[space]++;
1227 } else {
1228 hor_space_stat[space] = 1;
1229 }
1230
1231 int left, right, top, bottom;
1232
1233 left = area1.right();
1234 right = area2.left();
1235
1236 top = area2.top() < area1.top() ? area2.top() : area1.top();
1237 bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1238
1239 QRect rect(left, top, right - left, bottom - top);
1240 line_space_rects.append(rect);
1241 }
1242 }
1243
1244 space_rects.append(line_space_rects);
1245
1246 if (hor_space_stat.contains(maxSpace)) {
1247 if (hor_space_stat[maxSpace] != 1) {
1248 hor_space_stat[maxSpace]--;
1249 } else {
1250 hor_space_stat.remove(maxSpace);
1251 }
1252 }
1253
1254 if (maxSpace != 0) {
1255 if (col_space_stat.contains(maxSpace)) {
1256 col_space_stat[maxSpace]++;
1257 } else {
1258 col_space_stat[maxSpace] = 1;
1259 }
1260
1261 // store the max rect of each line
1262 const int left = max_area1.right();
1263 const int right = max_area2.left();
1264 const int top = (max_area1.top() > max_area2.top()) ? max_area2.top() : max_area1.top();
1265 const int bottom = (max_area1.bottom() < max_area2.bottom()) ? max_area2.bottom() : max_area1.bottom();
1266
1267 const QRect rect(left, top, right - left, bottom - top);
1268 max_hor_space_rects.append(rect);
1269 } else {
1270 max_hor_space_rects.append(QRect(0, 0, 0, 0));
1271 }
1272 }
1273
1274 // All the between word space counts are in hor_space_stat
1275 *word_spacing = 0;
1276 weighted_count = 0;
1277 QMapIterator<int, int> iterate(hor_space_stat);
1278
1279 while (iterate.hasNext()) {
1280 iterate.next();
1281
1282 if (iterate.key() > 0) {
1283 *word_spacing += iterate.value() * iterate.key();
1284 weighted_count += iterate.value();
1285 }
1286 }
1287 if (weighted_count) {
1288 *word_spacing = (int)((double)*word_spacing / (double)weighted_count + 0.5);
1289 }
1290
1291 *col_spacing = 0;
1292 QMapIterator<int, int> iterate_col(col_space_stat);
1293
1294 while (iterate_col.hasNext()) {
1295 iterate_col.next();
1296 if (iterate_col.value() > *col_spacing) {
1297 *col_spacing = iterate_col.value();
1298 }
1299 }
1300 *col_spacing = col_space_stat.key(*col_spacing);
1301
1302 // if there is just one line in a region, there is no point in dividing it
1303 if (sortedLines.length() == 1) {
1304 *word_spacing = *col_spacing;
1305 }
1306}
1307
1308/**
1309 * Implements the XY Cut algorithm for textpage segmentation
1310 * The resulting RegionTextList will contain RegionText whose WordsWithCharacters::word and
1311 * WordsWithCharacters::characters are reused from wordsWithCharacters (i.e. no new nor delete happens in this function)
1312 */
1313static RegionTextList XYCutForBoundingBoxes(const QList<WordWithCharacters> &wordsWithCharacters, int pageWidth, int pageHeight)
1314{
1315 RegionTextList tree;
1316 QRect contentRect(0, 0, pageWidth, pageHeight);
1317 const RegionText root(wordsWithCharacters, contentRect);
1318
1319 // start the tree with the root, it is our only region at the start
1320 tree.push_back(root);
1321
1322 int i = 0;
1323
1324 // while traversing the tree has not been ended
1325 while (i < tree.length()) {
1326 const RegionText node = tree.at(i);
1327 QRect regionRect = node.area();
1328
1329 /**
1330 * 1. calculation of projection profiles
1331 */
1332 // allocate the size of proj profiles and initialize with 0
1333 int size_proj_y = node.area().height();
1334 int size_proj_x = node.area().width();
1335 // dynamic memory allocation
1336 QVarLengthArray<int> proj_on_xaxis(size_proj_x);
1337 QVarLengthArray<int> proj_on_yaxis(size_proj_y);
1338
1339 for (int j = 0; j < size_proj_y; ++j) {
1340 proj_on_yaxis[j] = 0;
1341 }
1342 for (int j = 0; j < size_proj_x; ++j) {
1343 proj_on_xaxis[j] = 0;
1344 }
1345
1346 const QList<WordWithCharacters> list = node.text();
1347
1348 // Calculate tcx and tcy locally for each new region
1349 int word_spacing, line_spacing, column_spacing;
1350 calculateStatisticalInformation(list, pageWidth, pageHeight, &word_spacing, &line_spacing, &column_spacing);
1351
1352 const int tcx = word_spacing * 2;
1353 const int tcy = line_spacing * 2;
1354
1355 int maxX = 0, maxY = 0;
1356 int avgX = 0;
1357 int count;
1358
1359 // for every text in the region
1360 for (const WordWithCharacters &wwc : list) {
1361 const QRect entRect = wwc.area().geometry(pageWidth, pageHeight);
1362
1363 // calculate vertical projection profile proj_on_xaxis1
1364 for (int k = entRect.left(); k <= entRect.left() + entRect.width(); ++k) {
1365 if ((k - regionRect.left()) < size_proj_x && (k - regionRect.left()) >= 0) {
1366 proj_on_xaxis[k - regionRect.left()] += entRect.height();
1367 }
1368 }
1369
1370 // calculate horizontal projection profile in the same way
1371 for (int k = entRect.top(); k <= entRect.top() + entRect.height(); ++k) {
1372 if ((k - regionRect.top()) < size_proj_y && (k - regionRect.top()) >= 0) {
1373 proj_on_yaxis[k - regionRect.top()] += entRect.width();
1374 }
1375 }
1376 }
1377
1378 for (int j = 0; j < size_proj_y; ++j) {
1379 if (proj_on_yaxis[j] > maxY) {
1380 maxY = proj_on_yaxis[j];
1381 }
1382 }
1383
1384 avgX = count = 0;
1385 for (int j = 0; j < size_proj_x; ++j) {
1386 if (proj_on_xaxis[j] > maxX) {
1387 maxX = proj_on_xaxis[j];
1388 }
1389 if (proj_on_xaxis[j]) {
1390 count++;
1391 avgX += proj_on_xaxis[j];
1392 }
1393 }
1394 if (count) {
1395 avgX /= count;
1396 }
1397
1398 /**
1399 * 2. Cleanup Boundary White Spaces and removal of noise
1400 */
1401 int xbegin = 0, xend = size_proj_x - 1;
1402 int ybegin = 0, yend = size_proj_y - 1;
1403 while (xbegin < size_proj_x && proj_on_xaxis[xbegin] <= 0) {
1404 xbegin++;
1405 }
1406 while (xend >= 0 && proj_on_xaxis[xend] <= 0) {
1407 xend--;
1408 }
1409 while (ybegin < size_proj_y && proj_on_yaxis[ybegin] <= 0) {
1410 ybegin++;
1411 }
1412 while (yend >= 0 && proj_on_yaxis[yend] <= 0) {
1413 yend--;
1414 }
1415
1416 // update the regionRect
1417 int old_left = regionRect.left(), old_top = regionRect.top();
1418 regionRect.setLeft(old_left + xbegin);
1419 regionRect.setRight(old_left + xend);
1420 regionRect.setTop(old_top + ybegin);
1421 regionRect.setBottom(old_top + yend);
1422
1423 int tnx = (int)((double)avgX * 10.0 / 100.0 + 0.5), tny = 0;
1424 for (int j = 0; j < size_proj_x; ++j) {
1425 proj_on_xaxis[j] -= tnx;
1426 }
1427 for (int j = 0; j < size_proj_y; ++j) {
1428 proj_on_yaxis[j] -= tny;
1429 }
1430
1431 /**
1432 * 3. Find the Widest gap
1433 */
1434 int gap_hor = -1, pos_hor = -1;
1435 int begin = -1, end = -1;
1436
1437 // find all hor_gaps and find the maximum between them
1438 for (int j = 1; j < size_proj_y; ++j) {
1439 // transition from white to black
1440 if (begin >= 0 && proj_on_yaxis[j - 1] <= 0 && proj_on_yaxis[j] > 0) {
1441 end = j;
1442 }
1443
1444 // transition from black to white
1445 if (proj_on_yaxis[j - 1] > 0 && proj_on_yaxis[j] <= 0) {
1446 begin = j;
1447 }
1448
1449 if (begin > 0 && end > 0 && end - begin > gap_hor) {
1450 gap_hor = end - begin;
1451 pos_hor = (end + begin) / 2;
1452 begin = -1;
1453 end = -1;
1454 }
1455 }
1456
1457 begin = -1, end = -1;
1458 int gap_ver = -1, pos_ver = -1;
1459
1460 // find all the ver_gaps and find the maximum between them
1461 for (int j = 1; j < size_proj_x; ++j) {
1462 // transition from white to black
1463 if (begin >= 0 && proj_on_xaxis[j - 1] <= 0 && proj_on_xaxis[j] > 0) {
1464 end = j;
1465 }
1466
1467 // transition from black to white
1468 if (proj_on_xaxis[j - 1] > 0 && proj_on_xaxis[j] <= 0) {
1469 begin = j;
1470 }
1471
1472 if (begin > 0 && end > 0 && end - begin > gap_ver) {
1473 gap_ver = end - begin;
1474 pos_ver = (end + begin) / 2;
1475 begin = -1;
1476 end = -1;
1477 }
1478 }
1479
1480 int cut_pos_x = pos_ver, cut_pos_y = pos_hor;
1481 int gap_x = gap_ver, gap_y = gap_hor;
1482
1483 /**
1484 * 4. Cut the region and make nodes (left,right) or (up,down)
1485 */
1486 bool cut_hor = false, cut_ver = false;
1487
1488 // For horizontal cut
1489 const int topHeight = cut_pos_y - (regionRect.top() - old_top);
1490 const QRect topRect(regionRect.left(), regionRect.top(), regionRect.width(), topHeight);
1491 const QRect bottomRect(regionRect.left(), regionRect.top() + topHeight, regionRect.width(), regionRect.height() - topHeight);
1492
1493 // For vertical Cut
1494 const int leftWidth = cut_pos_x - (regionRect.left() - old_left);
1495 const QRect leftRect(regionRect.left(), regionRect.top(), leftWidth, regionRect.height());
1496 const QRect rightRect(regionRect.left() + leftWidth, regionRect.top(), regionRect.width() - leftWidth, regionRect.height());
1497
1498 if (gap_y >= gap_x && gap_y >= tcy) {
1499 cut_hor = true;
1500 } else if (gap_y >= gap_x && gap_y <= tcy && gap_x >= tcx) {
1501 cut_ver = true;
1502 } else if (gap_x >= gap_y && gap_x >= tcx) {
1503 cut_ver = true;
1504 } else if (gap_x >= gap_y && gap_x <= tcx && gap_y >= tcy) {
1505 cut_hor = true;
1506 } else {
1507 // no cut possible
1508 // we can now update the node rectangle with the shrinked rectangle
1509 RegionText tmpNode = tree.at(i);
1510 tmpNode.setArea(regionRect);
1511 tree.replace(i, tmpNode);
1512 i++;
1513 continue;
1514 }
1515
1516 WordsWithCharacters list1, list2;
1517
1518 // horizontal cut, topRect and bottomRect
1519 if (cut_hor) {
1520 for (const WordWithCharacters &word : list) {
1521 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1522
1523 if (topRect.intersects(wordRect)) {
1524 list1.append(word);
1525 } else {
1526 list2.append(word);
1527 }
1528 }
1529
1530 RegionText node1(list1, topRect);
1531 RegionText node2(list2, bottomRect);
1532
1533 tree.replace(i, node1);
1534 tree.insert(i + 1, node2);
1535 }
1536
1537 // vertical cut, leftRect and rightRect
1538 else if (cut_ver) {
1539 for (const WordWithCharacters &word : list) {
1540 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1541
1542 if (leftRect.intersects(wordRect)) {
1543 list1.append(word);
1544 } else {
1545 list2.append(word);
1546 }
1547 }
1548
1549 RegionText node1(list1, leftRect);
1550 RegionText node2(list2, rightRect);
1551
1552 tree.replace(i, node1);
1553 tree.insert(i + 1, node2);
1554 }
1555 }
1556
1557 tree.shrink_to_fit();
1558 return tree;
1559}
1560
1561/**
1562 * Add spaces in between words in a line. It reuses the pointers passed in tree and might add new ones. You will need to take care of deleting them if needed
1563 */
1564TextEntity::List addNecessarySpace(RegionTextList tree, int pageWidth, int pageHeight)
1565{
1566 /**
1567 * 1. Call makeAndSortLines before adding spaces in between words in a line
1568 * 2. Now add spaces between every two words in a line
1569 * 3. Finally, extract all the space separated texts from each region and return it
1570 */
1571
1572 TextEntity::List res;
1573 // Only change the texts under RegionTexts, not the area
1574 for (const RegionText &tmpRegion : std::as_const(tree)) {
1575 // Step 01
1576 QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(tmpRegion.text(), pageWidth, pageHeight);
1577 int counter = 0;
1578
1579 // Step 02
1580 for (QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1581 WordsWithCharacters &list = sortedLine.first;
1582 for (int k = 0; k < list.length(); k++) {
1583 const QRect area1 = list.at(k).area().roundedGeometry(pageWidth, pageHeight);
1584 if (k + 1 >= list.length()) {
1585 break;
1586 }
1587
1588 const QRect area2 = list.at(k + 1).area().roundedGeometry(pageWidth, pageHeight);
1589 const int space = area2.left() - area1.right();
1590
1591 if (space != 0) {
1592 // Make a TinyTextEntity of string space and push it between it and it+1
1593 const int left = area1.right();
1594 const int right = area2.left();
1595 const int top = area2.top() < area1.top() ? area2.top() : area1.top();
1596 const int bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1597
1598 const QString spaceStr(QStringLiteral(" "));
1599 const QRect rect(QPoint(left, top), QPoint(right, bottom));
1600 const NormalizedRect entRect(rect, pageWidth, pageHeight);
1601 TextEntity ent1 = TextEntity(spaceStr, entRect);
1602 WordWithCharacters word(ent1, QList<TextEntity>() << ent1);
1603
1604 list.insert(k + 1, word);
1605
1606 // Skip the space
1607 k++;
1608 }
1609 }
1610 counter += list.length();
1611 }
1612 res.reserve(res.length() + counter);
1613
1614 for (const QPair<WordsWithCharacters, QRect> &sortedLine : std::as_const(sortedLines)) {
1615 for (const WordWithCharacters &word : sortedLine.first) {
1616 res += word.characters;
1617 }
1618 }
1619 }
1620
1621 // Step 03
1622 res.shrink_to_fit();
1623 return res;
1624}
1625
1626/**
1627 * Correct the textOrder, all layout recognition works here
1628 */
1629void TextPagePrivate::correctTextOrder()
1630{
1631 // m_page->width() and m_page->height() are in pixels at
1632 // 100% zoom level, and thus depend on display DPI.
1633 // To avoid Okular failing on lowDPI displays,
1634 // we scale pageWidth and pageHeight so their sum equals 2000.
1635 const double scalingFactor = 2000.0 / (m_page->width() + m_page->height());
1636 const int pageWidth = (int)(scalingFactor * m_page->width());
1637 const int pageHeight = (int)(scalingFactor * m_page->height());
1638
1639 TextEntity::List characters = m_words;
1640
1641 /**
1642 * Remove spaces from the text
1643 */
1644 removeSpace(&characters);
1645
1646 /**
1647 * Construct words from characters
1648 */
1649 const QList<WordWithCharacters> wordsWithCharacters = makeWordFromCharacters(characters, pageWidth, pageHeight);
1650
1651 /**
1652 * Make a XY Cut tree for segmentation of the texts
1653 */
1654 RegionTextList tree = XYCutForBoundingBoxes(wordsWithCharacters, pageWidth, pageHeight);
1655
1656 /**
1657 * Add spaces to the word
1658 */
1659 const auto listOfCharacters = addNecessarySpace(std::move(tree), pageWidth, pageHeight);
1660
1661 setWordList(listOfCharacters);
1662}
1663
1665{
1666 if (area && area->isNull()) {
1667 return TextEntity::List();
1668 }
1669
1670 TextEntity::List ret;
1671 if (area) {
1672 for (const TextEntity &te : std::as_const(d->m_words)) {
1674 if (area->intersects(te.area())) {
1675 ret.append(te);
1676 }
1677 } else {
1678 const NormalizedPoint center = te.area().center();
1679 if (area->contains(center.x, center.y)) {
1680 ret.append(te);
1681 }
1682 }
1683 }
1684 } else {
1685 for (const TextEntity &te : std::as_const(d->m_words)) {
1686 ret.append(te);
1687 }
1688 }
1689 return ret;
1690}
1691
1692std::unique_ptr<RegularAreaRect> TextPage::wordAt(const NormalizedPoint &p) const
1693{
1694 TextEntity::List::ConstIterator itBegin = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
1696 TextEntity::List::ConstIterator posIt = itEnd;
1697 for (; it != itEnd; ++it) {
1698 if (it->area().contains(p.x, p.y)) {
1699 posIt = it;
1700 break;
1701 }
1702 }
1703 if (posIt != itEnd) {
1704 if (posIt->text().simplified().isEmpty()) {
1705 return nullptr;
1706 }
1707 // Find the first TinyTextEntity of the word
1708 while (posIt != itBegin) {
1709 --posIt;
1710 const QString itText = posIt->text();
1711 if (itText.right(1).at(0).isSpace()) {
1712 if (itText.endsWith(QLatin1String("-\n"))) {
1713 // Is an hyphenated word
1714 // continue searching the start of the word back
1715 continue;
1716 }
1717
1718 if (itText == QLatin1String("\n") && posIt != itBegin) {
1719 --posIt;
1720 if (posIt->text().endsWith(QLatin1String("-"))) {
1721 // Is an hyphenated word
1722 // continue searching the start of the word back
1723 continue;
1724 }
1725 ++posIt;
1726 }
1727
1728 ++posIt;
1729 break;
1730 }
1731 }
1732 auto ret = std::make_unique<RegularAreaRect>();
1733 QString foundWord;
1734 for (; posIt != itEnd; ++posIt) {
1735 const QString itText = posIt->text();
1736 if (itText.simplified().isEmpty()) {
1737 break;
1738 }
1739
1740 ret->appendShape(posIt->area());
1741 foundWord += posIt->text();
1742 if (itText.right(1).at(0).isSpace()) {
1743 if (!foundWord.endsWith(QLatin1String("-\n"))) {
1744 break;
1745 }
1746 }
1747 }
1748
1749 return ret;
1750 } else {
1751 return nullptr;
1752 }
1753}
NormalizedPoint is a helper class which stores the coordinates of a normalized point.
Definition area.h:117
double x
The normalized x coordinate.
Definition area.h:166
double y
The normalized y coordinate.
Definition area.h:171
A NormalizedRect is a rectangle which can be defined by two NormalizedPoints.
Definition area.h:189
double bottom
The normalized bottom coordinate.
Definition area.h:435
bool isRight(const NormalizedPoint &pt) const
Returns true if the point pt is located to the left of the right edge of the rectangle.
Definition area.h:376
bool isLeft(const NormalizedPoint &pt) const
Returns true if the point pt is located to the right of the left edge of the rectangle.
Definition area.h:367
bool contains(double x, double y) const
Returns whether the normalized rectangle contains the normalized point (x, y).
Definition area.cpp:160
bool isTop(const NormalizedPoint &pt) const
Returns true if the point pt is located above the top of the rectangle.
Definition area.h:340
bool intersects(const NormalizedRect &other) const
Returns whether the normalized rectangle intersects the other normalized rectangle.
Definition area.cpp:165
bool isTopOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located above the bottom of the rectangle.
Definition area.h:358
bool isBottomOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located below the top of the rectangle.
Definition area.h:349
QRect geometry(int xScale, int yScale) const
Returns the rectangle mapped to a reference area of xScale x yScale.
Definition area.cpp:232
double top
The normalized top coordinate.
Definition area.h:425
bool isBottom(const NormalizedPoint &pt) const
Returns true if the point pt is located below the bottom of the rectangle.
Definition area.h:331
QRect roundedGeometry(int xScale, int yScale) const
Same functionality as geometry, but the output is now rounded before typecasting to int.
Definition area.cpp:239
void transform(const QTransform &matrix)
Transforms the normalized rectangle with the operations defined by matrix.
Definition area.cpp:253
This is a list of NormalizedRect, to describe an area consisting of multiple rectangles using normali...
Definition area.h:933
void appendShape(const NormalizedShape &shape, MergeSide side=MergeAll)
Appends the given shape to this area.
Definition area.h:810
bool isNull() const
Returns whether the regular area is a null area.
Definition area.h:751
void simplify()
Simplifies this regular area by merging its intersecting subareas.
Definition area.h:730
bool contains(double x, double y) const
Returns whether this area contains the normalized point (x, y).
Definition area.h:866
bool intersects(const RegularArea< NormalizedShape, Shape > *area) const
Returns whether this area intersects with the given area.
Definition area.h:783
Represents a piece of text on a TextPage, containing its textual representation and its bounding box.
Definition textpage.h:53
QString text() const
Returns the text of the text entity.
Definition textpage.cpp:142
TextEntity(const QString &text, const NormalizedRect &area)
Creates a new text entity with the given text and the given area.
Definition textpage.cpp:134
NormalizedRect area() const
Returns the bounding area of the text entity.
Definition textpage.cpp:147
~TextEntity()
Destroys the text entity.
NormalizedRect transformedArea(const QTransform &matrix) const
Returns the transformed area of the text entity.
Definition textpage.cpp:152
TextEntity::List words(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
Text entity extraction function.
TextAreaInclusionBehaviour
Defines the behaviour of adding characters to text() result.
Definition textpage.h:113
@ AnyPixelTextAreaInclusionBehaviour
A character is included into text() result if any pixel of his bounding box is in the given area.
Definition textpage.h:114
QString text(const RegularAreaRect *area=nullptr) const
Text extraction function.
Definition textpage.cpp:868
std::unique_ptr< RegularAreaRect > wordAt(const NormalizedPoint &p) const
Returns the area and text of the word at the given point Note that ownership of the returned area bel...
RegularAreaRect * findText(int searchID, const QString &query, SearchDirection direction, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
Returns the bounding rect of the text which matches the following criteria or 0 if the search is not ...
Definition textpage.cpp:548
std::unique_ptr< RegularAreaRect > textArea(const TextSelection &selection) const
Returns the rectangular area of the given selection.
Definition textpage.cpp:276
void append(const QString &text, const NormalizedRect &area)
Appends the given text with the given area as new TextEntity to the page.
Definition textpage.cpp:185
~TextPage()
Destroys the text page.
Definition textpage.cpp:180
TextPage()
Creates a new text page.
Definition textpage.cpp:169
Wrapper around the information needed to generate the selection area.
Definition misc.h:19
NormalizedPoint end() const
Returns the end point of the selection.
Definition misc.cpp:42
NormalizedPoint start() const
Returns the start point of the selection.
Definition misc.cpp:37
Q_SCRIPTABLE Q_NOREPLY void start()
KSERVICE_EXPORT KService::List query(FilterFunc filterFunc)
KIOCORE_EXPORT QStringList list(const QString &fileClass)
const QList< QKeySequence > & begin()
const QList< QKeySequence > & end()
global.h
Definition action.h:17
SearchDirection
Describes the direction of searching.
Definition global.h:36
@ FromTop
Searching from top of the page, next result is to be found, there was no earlier search result.
Definition global.h:37
@ FromBottom
Searching from bottom of the page, next result is to be found, there was no earlier search result.
Definition global.h:38
@ PreviousResult
Searching for the previous result on the page, earlier result should be located so we search from the...
Definition global.h:40
@ NextResult
Searching for the next result on the page, earlier result should be located so we search from the las...
Definition global.h:39
MergeSide
The side(s) to be considered when merging areas.
Definition global.h:64
@ MergeRight
Merge only if the right side of the first area intersect.
Definition global.h:65
bool isSpace(char32_t ucs4)
typedef ConstIterator
typedef Iterator
void append(QList< T > &&value)
const_reference at(qsizetype i) const const
iterator begin()
iterator end()
iterator erase(const_iterator begin, const_iterator end)
T & first()
iterator insert(const_iterator before, parameter_type value)
bool isEmpty() const const
qsizetype length() const const
QList< T > mid(qsizetype pos, qsizetype length) const const
void reserve(qsizetype size)
void shrink_to_fit()
bool contains(const Key &key) const const
Key key(const T &value, const Key &defaultKey) const const
size_type remove(const Key &key)
int x() const const
int y() const const
int bottom() const const
QPoint center() const const
int height() const const
int left() const const
int right() const const
void setBottom(int y)
void setHeight(int height)
void setLeft(int x)
void setRight(int x)
void setTop(int y)
void setWidth(int width)
int top() const const
int width() const const
int x() const const
int y() const const
NormalizationForm_KC
QString & append(QChar ch)
const QChar at(qsizetype position) const const
bool endsWith(QChar c, Qt::CaseSensitivity cs) const const
QString & insert(qsizetype position, QChar ch)
bool isEmpty() const const
qsizetype length() const const
QString mid(qsizetype position, qsizetype n) const const
QString normalized(NormalizationForm mode, QChar::UnicodeVersion version) const const
QString right(qsizetype n) const const
QString simplified() const const
bool startsWith(QChar c, Qt::CaseSensitivity cs) const const
int compare(QChar ch) const const
CaseInsensitive
QTextStream & left(QTextStream &stream)
QTextStream & right(QTextStream &stream)
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Tue Mar 26 2024 11:17:35 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.