Okular

textpage.cpp
1/*
2 SPDX-FileCopyrightText: 2005 Piotr Szymanski <niedakh@gmail.com>
3
4 SPDX-License-Identifier: GPL-2.0-or-later
5*/
6
7#include "textpage.h"
8#include "textpage_p.h"
9
10#include <QDebug>
11
12#include "area.h"
13#include "debug_p.h"
14#include "misc.h"
15#include "page.h"
16#include "page_p.h"
17#include <unordered_set>
18
19#include <cstring>
20
21#include <QVarLengthArray>
22#include <QtAlgorithms>
23
24using namespace Okular;
25
26// Many of the strings are being reused; especially
27// those less than 2 letters are very common
28// Use the implicit shared bits from QString to
29// not keep multiple same strings around, but just up the
30// refcount a bit
31// The main reason for '2' is that most calls here happens
32// in auxillary threads that are following a document
33// and keeping the pool thread_local gives quite a bit
34// of advantage here.
35// Some calls though comes from the main thread, so we
36// shouldn't keep all the long strings allocated in the main
37// thread around forever.
38// '2' has been chosen by random testing, and guesswork.
39static QString fromPool(const QString &str)
40{
41 if (str.length() > 2) {
42 return str;
43 }
44
45 thread_local std::unordered_set<QString> pool;
46 auto [iterator, success] = pool.insert(str);
47 return *iterator;
48}
49
50class SearchPoint
51{
52public:
53 SearchPoint()
54 : offset_begin(-1)
55 , offset_end(-1)
56 {
57 }
58
59 /** The TextEntity containing the first character of the match. */
61
62 /** The TextEntity containing the last character of the match. */
64
65 /** The index of the first character of the match in (*it_begin)->text().
66 * Satisfies 0 <= offset_begin < (*it_begin)->text().length().
67 */
68 int offset_begin;
69
70 /** One plus the index of the last character of the match in (*it_end)->text().
71 * Satisfies 0 < offset_end <= (*it_end)->text().length().
72 */
73 int offset_end;
74};
75
76/* text comparison functions */
77
78static bool CaseInsensitiveCmpFn(QStringView from, QStringView to)
79{
80#ifdef DEBUG_TEXTPAGE
81 qDebug(OkularCoreDebug) << from << ":" << to << "(case insensitive)";
82#endif
83 return from.compare(to, Qt::CaseInsensitive) == 0;
84}
85
86static bool CaseSensitiveCmpFn(QStringView from, QStringView to)
87{
88#ifdef DEBUG_TEXTPAGE
89 qDebug(OkularCoreDebug) << from << ":" << to << "(case sensitive)";
90#endif
91 return from.compare(to, Qt::CaseSensitive) == 0;
92}
93
94/**
95 * Returns true iff segments [@p left1, @p right1] and [@p left2, @p right2] on the real line
96 * overlap within @p threshold percent, i. e. iff the ratio of the length of the
97 * intersection of the segments to the length of the shortest of the two input segments
98 * is not smaller than the threshold.
99 */
100static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold)
101{
102 // check if one consumes another fully (speed optimization)
103
104 if (left1 <= left2 && right1 >= right2) {
105 return true;
106 }
107
108 if (left1 >= left2 && right1 <= right2) {
109 return true;
110 }
111
112 // check if there is overlap above threshold
113 if (right2 >= left1 && right1 >= left2) {
114 double overlap = (right2 >= right1) ? right1 - left2 : right2 - left1;
115
116 double length1 = right1 - left1, length2 = right2 - left2;
117
118 return overlap * 100 >= threshold * qMin(length1, length2);
119 }
120
121 return false;
122}
123
124static bool doesConsumeY(const QRect first, const QRect second, int threshold)
125{
126 return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold);
127}
128
129static bool doesConsumeY(const NormalizedRect &first, const NormalizedRect &second, int threshold)
130{
131 return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold);
132}
133
135 : m_text(fromPool(text))
136 , m_area(area)
137{
138}
139
140TextEntity::~TextEntity() = default;
141
143{
144 return m_text;
145}
146
148{
149 return m_area;
150}
151
153{
154 NormalizedRect transformed_area = m_area;
155 transformed_area.transform(matrix);
156 return transformed_area;
157}
158
159TextPagePrivate::TextPagePrivate()
160 : m_page(nullptr)
161{
162}
163
164TextPagePrivate::~TextPagePrivate()
165{
166 qDeleteAll(m_searchPoints);
167}
168
170 : d(new TextPagePrivate())
171{
172}
173
175 : d(new TextPagePrivate())
176{
177 d->m_words = words;
178}
179
181{
182 delete d;
183}
184
185void TextPage::append(const QString &text, const NormalizedRect &area)
186{
187 if (!text.isEmpty()) {
188 if (!d->m_words.isEmpty()) {
189 const TextEntity &lastEntity = d->m_words.last();
190 // Unicode Normalization Form KC (NFKC) may alter characters, for example ⑥ to 6, so we use NFC
191 const QString concatText = lastEntity.text() + text.normalized(QString::NormalizationForm_C);
192 if (concatText != concatText.normalized(QString::NormalizationForm_C)) {
193 // If this happens it means that the new text + old one have combined, for example A and ◌̊ form Å
194 NormalizedRect newArea = area | lastEntity.area();
195 d->m_words.removeLast();
196 d->m_words.append(TextEntity(concatText.normalized(QString::NormalizationForm_C), newArea));
197 return;
198 }
199 }
200
201 d->m_words.append(TextEntity(text.normalized(QString::NormalizationForm_C), area));
202 }
203}
204
205struct WordWithCharacters {
206 WordWithCharacters(const TextEntity &w, const TextEntity::List &c)
207 : word(w)
208 , characters(c)
209 {
210 }
211
212 inline QString text() const
213 {
214 return word.text();
215 }
216
217 inline NormalizedRect area() const
218 {
219 return word.area();
220 }
221
222 TextEntity word;
223 TextEntity::List characters;
224};
226
227/**
228 * We will divide the whole page in some regions depending on the horizontal and
229 * vertical spacing among different regions. Each region will have an area and an
230 * associated WordsWithCharacters in sorted order.
231 */
232class RegionText
233{
234public:
235 RegionText() {};
236
237 RegionText(const WordsWithCharacters &wordsWithCharacters, const QRect area)
238 : m_region_wordWithCharacters(wordsWithCharacters)
239 , m_area(area)
240 {
241 }
242
243 inline QString string() const
244 {
245 QString res;
246 for (const WordWithCharacters &word : m_region_wordWithCharacters) {
247 res += word.text();
248 }
249 return res;
250 }
251
252 inline WordsWithCharacters text() const
253 {
254 return m_region_wordWithCharacters;
255 }
256
257 inline QRect area() const
258 {
259 return m_area;
260 }
261
262 inline void setArea(const QRect area)
263 {
264 m_area = area;
265 }
266
267 inline void setText(const WordsWithCharacters &wordsWithCharacters)
268 {
269 m_region_wordWithCharacters = wordsWithCharacters;
270 }
271
272private:
273 WordsWithCharacters m_region_wordWithCharacters;
274 QRect m_area;
275};
276
277std::unique_ptr<RegularAreaRect> TextPage::textArea(const TextSelection &sel) const
278{
279 if (d->m_words.isEmpty()) {
280 return std::make_unique<RegularAreaRect>();
281 }
282
283 /**
284 It works like this:
285 There are two cursors, we need to select all the text between them. The coordinates are normalised, leftTop is (0,0)
286 rightBottom is (1,1), so for cursors start (sx,sy) and end (ex,ey) we start with finding text rectangles under those
287 points, if not we search for the first that is to the right to it in the same baseline, if none found, then we search
288 for the first rectangle with a baseline under the cursor, having two points that are the best rectangles to both
289 of the cursors: (rx,ry)x(tx,ty) for start and (ux,uy)x(vx,vy) for end, we do a
290 1. (rx,ry)x(1,ty)
291 2. (0,ty)x(1,uy)
292 3. (0,uy)x(vx,vy)
293
294 To find the closest rectangle to cursor (cx,cy) we search for a rectangle that either contains the cursor
295 or that has a left border >= cx and bottom border >= cy.
296 */
297 auto ret = std::make_unique<RegularAreaRect>();
298
299 const PagePrivate *pagePrivate = PagePrivate::get(d->m_page);
300 const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
301 const double scaleX = d->m_page->width();
302 const double scaleY = d->m_page->height();
303
304 NormalizedPoint startC = sel.start();
305 NormalizedPoint endC = sel.end();
306
307 // if startPoint is right to endPoint swap them
308 if (startC.x > endC.x) {
309 std::swap(startC, endC);
310 }
311
312 // minX,maxX,minY,maxY gives the bounding rectangle coordinates of the document
313 const NormalizedRect boundingRect = d->m_page->boundingBox();
314 const QRect content = boundingRect.geometry(scaleX, scaleY);
315 const double minX = content.left();
316 const double maxX = content.right();
317 const double minY = content.top();
318 const double maxY = content.bottom();
319
320 /**
321 * We will now find out the TinyTextEntity for the startRectangle and TinyTextEntity for
322 * the endRectangle. We have four cases:
323 *
324 * Case 1(a): both startpoint and endpoint are out of the bounding Rectangle and at one side, so the rectangle made of start
325 * and endPoint are outof the bounding rect (do not intersect)
326 *
327 * Case 1(b): both startpoint and endpoint are out of bounding rect, but they are in different side, so is their rectangle
328 *
329 * Case 2(a): find the rectangle which contains start and endpoint and having some
330 * TextEntity
331 *
332 * Case 2(b): if 2(a) fails (if startPoint and endPoint both are unchanged), then we check whether there is any
333 * TextEntity within the rect made by startPoint and endPoint
334 *
335 * Case 3: Now, we may have two type of selection.
336 * 1. startpoint is left-top of start_end and endpoint is right-bottom
337 * 2. startpoint is left-bottom of start_end and endpoint is top-right
338 *
339 * Also, as 2(b) is passed, we might have it,itEnd or both unchanged, but the fact is that we have
340 * text within them. so, we need to search for the best suitable textposition for start and end.
341 *
342 * Case 3(a): We search the nearest rectangle consisting of some
343 * TinyTextEntity right to or bottom of the startPoint for selection 01.
344 * And, for selection 02, we have to search for right and top
345 *
346 * Case 3(b): For endpoint, we have to find the point top of or left to
347 * endpoint if we have selection 01.
348 * Otherwise, the search will be left and bottom
349 */
350
351 // we know that startC.x > endC.x, we need to decide which is top and which is bottom
352 const NormalizedRect start_end = (startC.y < endC.y) ? NormalizedRect(startC.x, startC.y, endC.x, endC.y) : NormalizedRect(startC.x, endC.y, endC.x, startC.y);
353
354 // Case 1(a)
355 if (!boundingRect.intersects(start_end)) {
356 return ret;
357 } else {
358 // case 1(b)
359 /**
360 note that, after swapping of start and end, we know that,
361 start is always left to end. but, we cannot say start is
362 positioned upper than end.
363 **/
364 // if start is left to content rect take it to content rect boundary
365 if (startC.x * scaleX < minX) {
366 startC.x = minX / scaleX;
367 }
368 if (endC.x * scaleX > maxX) {
369 endC.x = maxX / scaleX;
370 }
371
372 // if start is top to end (selection type 01)
373 if (startC.y * scaleY < minY) {
374 startC.y = minY / scaleY;
375 }
376 if (endC.y * scaleY > maxY) {
377 endC.y = maxY / scaleY;
378 }
379
380 // if start is bottom to end (selection type 02)
381 if (startC.y * scaleY > maxY) {
382 startC.y = maxY / scaleY;
383 }
384 if (endC.y * scaleY < minY) {
385 endC.y = minY / scaleY;
386 }
387 }
388
389 TextEntity::List::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
390 TextEntity::List::ConstIterator start = it, end = itEnd, tmpIt = it; //, tmpItEnd = itEnd;
391 const MergeSide side = d->m_page ? (MergeSide)d->m_page->totalOrientation() : MergeRight;
392
393 NormalizedRect tmp;
394 // case 2(a)
395 for (; it != itEnd; ++it) {
396 tmp = it->area();
397 if (tmp.contains(startC.x, startC.y)) {
398 start = it;
399 }
400 if (tmp.contains(endC.x, endC.y)) {
401 end = it;
402 }
403 }
404
405 // case 2(b)
406 it = tmpIt;
407 if (start == it && end == itEnd) {
408 for (; it != itEnd; ++it) {
409 // is there any text rectangle within the start_end rect
410 tmp = it->area();
411 if (start_end.intersects(tmp)) {
412 break;
413 }
414 }
415
416 // we have searched every text entities, but none is within the rectangle created by start and end
417 // so, no selection should be done
418 if (it == itEnd) {
419 return ret;
420 }
421 }
422 it = tmpIt;
423 bool selection_two_start = false;
424
425 // case 3.a
426 if (start == it) {
427 NormalizedRect rect;
428
429 // selection type 01
430 if (startC.y <= endC.y) {
431 for (; it != itEnd; ++it) {
432 rect = it->area();
433 bool flagV = !rect.isBottom(startC);
434
435 if (flagV && rect.isRight(startC)) {
436 start = it;
437 break;
438 }
439 }
440 }
441
442 // selection type 02
443 else {
444 selection_two_start = true;
445 int distance = scaleX + scaleY + 100;
446
447 for (; it != itEnd; ++it) {
448 rect = it->area();
449
450 if (rect.isBottomOrLevel(startC) && rect.isRight(startC)) {
451 QRect entRect = rect.geometry(scaleX, scaleY);
452 int xdist, ydist;
453 xdist = entRect.center().x() - startC.x * scaleX;
454 ydist = entRect.center().y() - startC.y * scaleY;
455
456 // make them positive
457 if (xdist < 0) {
458 xdist = -xdist;
459 }
460 if (ydist < 0) {
461 ydist = -ydist;
462 }
463
464 if ((xdist + ydist) < distance) {
465 distance = xdist + ydist;
466 start = it;
467 }
468 }
469 }
470 }
471 }
472
473 // case 3.b
474 if (end == itEnd) {
475 it = tmpIt;
476 itEnd = itEnd - 1;
477
478 NormalizedRect rect;
479
480 if (startC.y <= endC.y) {
481 for (; itEnd >= it; itEnd--) {
482 rect = itEnd->area();
483 bool flagV = !rect.isTop(endC);
484
485 if (flagV && rect.isLeft(endC)) {
486 end = itEnd;
487 break;
488 }
489 }
490 }
491
492 else {
493 int distance = scaleX + scaleY + 100;
494 for (; itEnd >= it; itEnd--) {
495 rect = itEnd->area();
496
497 if (rect.isTopOrLevel(endC) && rect.isLeft(endC)) {
498 QRect entRect = rect.geometry(scaleX, scaleY);
499 int xdist, ydist;
500 xdist = entRect.center().x() - endC.x * scaleX;
501 ydist = entRect.center().y() - endC.y * scaleY;
502
503 // make them positive
504 if (xdist < 0) {
505 xdist = -xdist;
506 }
507 if (ydist < 0) {
508 ydist = -ydist;
509 }
510
511 if ((xdist + ydist) < distance) {
512 distance = xdist + ydist;
513 end = itEnd;
514 }
515 }
516 }
517 }
518 }
519
520 /* if start and end in selection 02 are in the same column, and we
521 start at an empty space we have to remove the selection of last
522 character
523 */
524 if (selection_two_start) {
525 if (start > end) {
526 start = start - 1;
527 }
528 }
529
530 // if start is less than end swap them
531 if (start > end) {
532 it = start;
533 start = end;
534 end = it;
535 }
536
537 // removes the possibility of crash, in case none of 1 to 3 is true
538 if (end == d->m_words.constEnd()) {
539 end--;
540 }
541
542 for (; start <= end; start++) {
543 ret->appendShape(start->transformedArea(matrix), side);
544 }
545
546 return ret;
547}
548
549RegularAreaRect *TextPage::findText(int searchID, const QString &query, SearchDirection direct, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
550{
551 SearchDirection dir = direct;
552 // invalid search request
553 if (d->m_words.isEmpty() || query.isEmpty() || (area && area->isNull())) {
554 return nullptr;
555 }
557 int start_offset = 0;
559 const QMap<int, SearchPoint *>::const_iterator sIt = d->m_searchPoints.constFind(searchID);
560 if (sIt == d->m_searchPoints.constEnd()) {
561 // if no previous run of this search is found, then set it to start
562 // from the beginning (respecting the search direction)
563 if (dir == NextResult) {
564 dir = FromTop;
565 } else if (dir == PreviousResult) {
566 dir = FromBottom;
567 }
568 }
569 bool forward = true;
570 switch (dir) {
571 case FromTop:
572 start = d->m_words.constBegin();
573 start_offset = 0;
574 end = d->m_words.constEnd();
575 break;
576 case FromBottom:
577 start = d->m_words.constEnd();
578 start_offset = 0;
579 end = d->m_words.constBegin();
580 forward = false;
581 break;
582 case NextResult:
583 start = (*sIt)->it_end;
584 start_offset = (*sIt)->offset_end;
585 end = d->m_words.constEnd();
586 break;
587 case PreviousResult:
588 start = (*sIt)->it_begin;
589 start_offset = (*sIt)->offset_begin;
590 end = d->m_words.constBegin();
591 forward = false;
592 break;
593 };
594 RegularAreaRect *ret = nullptr;
595 const TextComparisonFunction cmpFn = caseSensitivity == Qt::CaseSensitive ? CaseSensitiveCmpFn : CaseInsensitiveCmpFn;
596 if (forward) {
597 ret = d->findTextInternalForward(searchID, query, cmpFn, start, start_offset, end);
598 } else {
599 ret = d->findTextInternalBackward(searchID, query, cmpFn, start, start_offset, end);
600 }
601 return ret;
602}
603
604// hyphenated '-' must be at the end of a word, so hyphenation means
605// we have a '-' just followed by a '\n' character
606// check if the string contains a '-' character
607// if the '-' is the last entry
608static int stringLengthAdaptedWithHyphen(const QString &str, TextEntity::List::ConstIterator it, TextEntity::List::ConstIterator textListEnd)
609{
610 const int len = str.length();
611
612 // hyphenated '-' must be at the end of a word, so hyphenation means
613 // we have a '-' just followed by a '\n' character
614 // check if the string contains a '-' character
615 // if the '-' is the last entry
616 if (str.endsWith(QLatin1Char('-'))) {
617 // validity chek of it + 1
618 if ((it + 1) != textListEnd) {
619 // 1. if the next character is '\n'
620 const QString &lookahedStr = (it + 1)->text();
621 if (lookahedStr.startsWith(QLatin1Char('\n'))) {
622 return len - 1;
623 }
624
625 // 2. if the next word is in a different line or not
626 const NormalizedRect &hyphenArea = it->area();
627 const NormalizedRect &lookaheadArea = (it + 1)->area();
628
629 // lookahead to check whether both the '-' rect and next character rect overlap
630 if (!doesConsumeY(hyphenArea, lookaheadArea, 70)) {
631 return len - 1;
632 }
633 }
634 }
635 // else if it is the second last entry - for example in pdf format
636 else if (str.endsWith(QLatin1String("-\n"))) {
637 return len - 2;
638 }
639
640 return len;
641}
642
643RegularAreaRect *TextPagePrivate::searchPointToArea(const SearchPoint *sp)
644{
645 const PagePrivate *pagePrivate = PagePrivate::get(m_page);
646 const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
648
649 for (TextEntity::List::ConstIterator it = sp->it_begin;; it++) {
650 const TextEntity &curEntity = *it;
651 ret->append(curEntity.transformedArea(matrix));
652
653 if (it == sp->it_end) {
654 break;
655 }
656 }
657
658 ret->simplify();
659 return ret;
660}
661
662RegularAreaRect *TextPagePrivate::findTextInternalForward(int searchID, const QString &_query, TextComparisonFunction comparer, TextEntity::List::ConstIterator start, int start_offset, TextEntity::List::ConstIterator end)
663{
664 // normalize query search all unicode (including glyphs)
665 // Use NFKC for search operations. Use NFC for copy, makeWord, and export operations.
667
668 // j is the current position in our query
669 // queryLeft is the length of the query we have left to match
670 int j = 0, queryLeft = query.length();
671
673 int offset = start_offset;
674
676 int offset_begin = 0; // dummy initial value to suppress compiler warnings
677
678 while (it != end) {
679 const TextEntity &curEntity = *it;
680 const QString &str = curEntity.text().normalized(QString::NormalizationForm_KC);
681 const int strLen = str.length();
682 const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
683 // adjustedLen <= strLen
684
685 if (offset >= strLen) {
686 it++;
687 offset = 0;
688 continue;
689 }
690
691 if (it_begin == TextEntity::List::ConstIterator()) {
692 it_begin = it;
693 offset_begin = offset;
694 }
695
696 // Let the user write the hyphen or not when searching for text
697 int matchedLen = -1;
698 for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
699 // we have equal (or less than) area of the query left as the length of the current
700 // entity
701 const int min = qMin(queryLeft, matchingLen - offset);
702 if (comparer(QStringView {str}.mid(offset, min), QStringView {query}.mid(j, min))) {
703 matchedLen = min;
704 break;
705 }
706 }
707
708 if (matchedLen == -1) {
709 // we have not matched
710 // this means we do not have a complete match
711 // we need to get back to query start
712 // and continue the search from this place
713#ifdef DEBUG_TEXTPAGE
714 qCDebug(OkularCoreDebug) << "\tnot matched";
715#endif
716 j = 0;
717 queryLeft = query.length();
718 it = it_begin;
719 offset = offset_begin + 1;
721 } else {
722 // we have a match
723 // move the current position in the query
724 // to the position after the length of this string
725 // we matched
726 // subtract the length of the current entity from
727 // the left length of the query
728#ifdef DEBUG_TEXTPAGE
729 qCDebug(OkularCoreDebug) << "\tmatched" << matchedLen;
730#endif
731 j += matchedLen;
732 queryLeft -= matchedLen;
733
734 if (queryLeft == 0) {
735 // save or update the search point for the current searchID
736 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
737 if (sIt == m_searchPoints.end()) {
738 sIt = m_searchPoints.insert(searchID, new SearchPoint);
739 }
740 SearchPoint *sp = *sIt;
741 sp->it_begin = it_begin;
742 sp->it_end = it;
743 sp->offset_begin = offset_begin;
744 sp->offset_end = offset + matchedLen;
745 return searchPointToArea(sp);
746 }
747
748 it++;
749 offset = 0;
750 }
751 }
752 // end of loop - it means that we've ended the textentities
753
754 const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
755 if (sIt != m_searchPoints.end()) {
756 SearchPoint *sp = *sIt;
757 m_searchPoints.erase(sIt);
758 delete sp;
759 }
760 return nullptr;
761}
762
763RegularAreaRect *TextPagePrivate::findTextInternalBackward(int searchID, const QString &_query, TextComparisonFunction comparer, TextEntity::List::ConstIterator start, int start_offset, TextEntity::List::ConstIterator end)
764{
765 // normalize query to search all unicode (including glyphs)
766 // Use NFKC for search operations. Use NFC for copy, makeWord, and export operations.
768
769 // j is the current position in our query
770 // len is the length of the string in TextEntity
771 // queryLeft is the length of the query we have left
772 int j = query.length(), queryLeft = query.length();
773
775 int offset = start_offset;
776
778 int offset_begin = 0; // dummy initial value to suppress compiler warnings
779
780 while (true) {
781 if (offset <= 0) {
782 if (it == end) {
783 break;
784 }
785 it--;
786 }
787
788 const TextEntity &curEntity = *it;
789 const QString &str = curEntity.text().normalized(QString::NormalizationForm_KC);
790 const int strLen = str.length();
791 const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
792 // adjustedLen <= strLen
793
794 if (offset <= 0) {
795 offset = strLen;
796 }
797
798 if (it_begin == TextEntity::List::ConstIterator()) {
799 it_begin = it;
800 offset_begin = offset;
801 }
802
803 // Let the user write the hyphen or not when searching for text
804 int matchedLen = -1;
805 // we have equal (or less than) area of the query left as the length of the current
806 // entity
807 for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
808 const int hyphenOffset = (strLen - matchingLen);
809 const int min = qMin(queryLeft + hyphenOffset, offset);
810 if (comparer(QStringView {str}.mid(offset - min, min - hyphenOffset), QStringView {query}.mid(j - min + hyphenOffset, min - hyphenOffset))) {
811 matchedLen = min - hyphenOffset;
812 break;
813 }
814 }
815
816 if (matchedLen == -1) {
817 // we have not matched
818 // this means we do not have a complete match
819 // we need to get back to query start
820 // and continue the search from this place
821#ifdef DEBUG_TEXTPAGE
822 qCDebug(OkularCoreDebug) << "\tnot matched";
823#endif
824
825 j = query.length();
826 queryLeft = query.length();
827 it = it_begin;
828 offset = offset_begin - 1;
830 } else {
831 // we have a match
832 // move the current position in the query
833 // to the position after the length of this string
834 // we matched
835 // subtract the length of the current entity from
836 // the left length of the query
837#ifdef DEBUG_TEXTPAGE
838 qCDebug(OkularCoreDebug) << "\tmatched";
839#endif
840 j -= matchedLen;
841 queryLeft -= matchedLen;
842
843 if (queryLeft == 0) {
844 // save or update the search point for the current searchID
845 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
846 if (sIt == m_searchPoints.end()) {
847 sIt = m_searchPoints.insert(searchID, new SearchPoint);
848 }
849 SearchPoint *sp = *sIt;
850 sp->it_begin = it;
851 sp->it_end = it_begin;
852 sp->offset_begin = offset - matchedLen;
853 sp->offset_end = offset_begin;
854 return searchPointToArea(sp);
855 }
856
857 offset = 0;
858 }
859 }
860 // end of loop - it means that we've ended the textentities
861
862 const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
863 if (sIt != m_searchPoints.end()) {
864 SearchPoint *sp = *sIt;
865 m_searchPoints.erase(sIt);
866 delete sp;
867 }
868 return nullptr;
869}
870
872{
874}
875
877{
878 if (area && area->isNull()) {
879 return QString();
880 }
881
882 TextEntity::List::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
883 QString ret;
884 if (area) {
885 for (; it != itEnd; ++it) {
887 if (area->intersects(it->area())) {
888 ret += it->text();
889 }
890 } else {
891 NormalizedPoint center = it->area().center();
892 if (area->contains(center.x, center.y)) {
893 ret += it->text();
894 }
895 }
896 }
897 } else {
898 for (; it != itEnd; ++it) {
899 ret += it->text();
900 }
901 }
902 return ret;
903}
904
905static bool compareTinyTextEntityX(const WordWithCharacters &first, const WordWithCharacters &second)
906{
907 QRect firstArea = first.area().roundedGeometry(1000, 1000);
908 QRect secondArea = second.area().roundedGeometry(1000, 1000);
909
910 return firstArea.left() < secondArea.left();
911}
912
913static bool compareTinyTextEntityY(const WordWithCharacters &first, const WordWithCharacters &second)
914{
915 const QRect firstArea = first.area().roundedGeometry(1000, 1000);
916 const QRect secondArea = second.area().roundedGeometry(1000, 1000);
917
918 return firstArea.top() < secondArea.top();
919}
920
921/**
922 * Sets a new world list. Deleting the contents of the old one
923 */
924void TextPagePrivate::setWordList(const TextEntity::List &list)
925{
926 m_words = list;
927}
928
929/**
930 * Remove all the spaces in between texts. It will make all the generators
931 * same, whether they save spaces(like pdf) or not(like djvu).
932 */
933static void removeSpace(TextEntity::List *words)
934{
935 TextEntity::List::Iterator it = words->begin();
936 const QString str(QLatin1Char(' '));
937
938 while (it != words->end()) {
939 if (it->text() == str) {
940 it = words->erase(it);
941 } else {
942 ++it;
943 }
944 }
945}
946
947/**
948 * We will read the TinyTextEntity from characters and try to create words from there.
949 * Note: characters might be already characters for some generators, but we will keep
950 * the nomenclature characters for the generator produced data. The resulting
951 * WordsWithCharacters memory has to be managed by the caller, both the
952 * WordWithCharacters::word and WordWithCharacters::characters contents
953 */
954static WordsWithCharacters makeWordFromCharacters(const TextEntity::List &characters, int pageWidth, int pageHeight)
955{
956 /**
957 * We will traverse characters and try to create words from the TinyTextEntities in it.
958 * We will search TinyTextEntity blocks and merge them until we get a
959 * space between two consecutive TinyTextEntities. When we get a space
960 * we can take it as a end of word. Then we store the word as a TinyTextEntity
961 * and keep it in newList.
962
963 * We create a RegionText named regionWord that contains the word and the characters associated with it and
964 * a rectangle area of the element in newList.
965
966 */
967 WordsWithCharacters wordsWithCharacters;
968
969 TextEntity::List::ConstIterator it = characters.begin(), itEnd = characters.end(), tmpIt;
970 int newLeft, newRight, newTop, newBottom;
971
972 for (; it != itEnd; it++) {
973 QString textString = it->text();
974 QString newString;
975 QRect lineArea = it->area().roundedGeometry(pageWidth, pageHeight);
976 QRect elementArea;
977 TextEntity::List wordCharacters;
978 tmpIt = it;
979 int space = 0;
980
981 while (!space) {
982 if (!textString.isEmpty()) {
983 newString.append(textString);
984
985 // when textString is the start of the word
986 if (tmpIt == it) {
987 NormalizedRect newRect(lineArea, pageWidth, pageHeight);
988 wordCharacters.append(TextEntity(textString.normalized(QString::NormalizationForm_C), newRect));
989 } else {
990 NormalizedRect newRect(elementArea, pageWidth, pageHeight);
991 wordCharacters.append(TextEntity(textString.normalized(QString::NormalizationForm_C), newRect));
992 }
993 }
994
995 ++it;
996
997 /*
998 we must have to put this line before the if condition of it==itEnd
999 otherwise the last character can be missed
1000 */
1001 if (it == itEnd) {
1002 break;
1003 }
1004 elementArea = it->area().roundedGeometry(pageWidth, pageHeight);
1005 if (!doesConsumeY(elementArea, lineArea, 60)) {
1006 --it;
1007 break;
1008 }
1009
1010 const int text_y1 = elementArea.top(), text_x1 = elementArea.left(), text_y2 = elementArea.y() + elementArea.height(), text_x2 = elementArea.x() + elementArea.width();
1011 const int line_y1 = lineArea.top(), line_x1 = lineArea.left(), line_y2 = lineArea.y() + lineArea.height(), line_x2 = lineArea.x() + lineArea.width();
1012
1013 space = elementArea.left() - lineArea.right();
1014
1015 if (space != 0) {
1016 it--;
1017 break;
1018 }
1019
1020 newLeft = text_x1 < line_x1 ? text_x1 : line_x1;
1021 newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1022 newTop = text_y1 > line_y1 ? line_y1 : text_y1;
1023 newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1024
1025 lineArea.setLeft(newLeft);
1026 lineArea.setTop(newTop);
1027 lineArea.setWidth(newRight - newLeft);
1028 lineArea.setHeight(newBottom - newTop);
1029
1030 textString = it->text();
1031 }
1032
1033 // if newString is not empty, save it
1034 if (!newString.isEmpty()) {
1035 const NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1037 wordsWithCharacters.append(WordWithCharacters(word, wordCharacters));
1038 }
1039
1040 if (it == itEnd) {
1041 break;
1042 }
1043 }
1044
1045 wordsWithCharacters.shrink_to_fit();
1046 return wordsWithCharacters;
1047}
1048
1049/**
1050 * Create Lines from the words and sort them
1051 */
1052QList<QPair<WordsWithCharacters, QRect>> makeAndSortLines(const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight)
1053{
1054 /**
1055 * We cannot assume that the generator will give us texts in the right order.
1056 * We can only assume that we will get texts in the page and their bounding
1057 * rectangle. The texts can be character, word, half-word anything.
1058 * So, we need to:
1059 **
1060 * 1. Sort rectangles/boxes containing texts by y0(top)
1061 * 2. Create textline where there is y overlap between TinyTextEntity 's
1062 * 3. Within each line sort the TinyTextEntity 's by x0(left)
1063 */
1064
1066
1067 /*
1068 Make a new copy of the TextList in the words, so that the wordsTmp and lines do
1069 not contain same pointers for all the TinyTextEntity.
1070 */
1071 QList<WordWithCharacters> words = wordsTmp;
1072
1073 // Step 1
1074 std::sort(words.begin(), words.end(), compareTinyTextEntityY);
1075
1076 // Step 2
1077 QList<WordWithCharacters>::Iterator it = words.begin(), itEnd = words.end();
1078
1079 // for every non-space texts(characters/words) in the textList
1080 for (; it != itEnd; it++) {
1081 const QRect elementArea = (*it).area().roundedGeometry(pageWidth, pageHeight);
1082 bool found = false;
1083
1084 for (QPair<WordsWithCharacters, QRect> &linesI : lines) {
1085 /* the line area which will be expanded
1086 line_rects is only necessary to preserve the topmin and bottommax of all
1087 the texts in the line, left and right is not necessary at all
1088 */
1089 QRect &lineArea = linesI.second;
1090 const int text_y1 = elementArea.top(), text_y2 = elementArea.top() + elementArea.height(), text_x1 = elementArea.left(), text_x2 = elementArea.left() + elementArea.width();
1091 const int line_y1 = lineArea.top(), line_y2 = lineArea.top() + lineArea.height(), line_x1 = lineArea.left(), line_x2 = lineArea.left() + lineArea.width();
1092
1093 /*
1094 if the new text and the line has y overlapping parts of more than 70%,
1095 the text will be added to this line
1096 */
1097 if (doesConsumeY(elementArea, lineArea, 70)) {
1098 WordsWithCharacters &line = linesI.first;
1099 line.append(*it);
1100
1101 const int newLeft = line_x1 < text_x1 ? line_x1 : text_x1;
1102 const int newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1103 const int newTop = line_y1 < text_y1 ? line_y1 : text_y1;
1104 const int newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1105
1106 lineArea = QRect(newLeft, newTop, newRight - newLeft, newBottom - newTop);
1107 found = true;
1108 }
1109
1110 if (found) {
1111 break;
1112 }
1113 }
1114
1115 /* when we have found a new line create a new TextList containing
1116 only one element and append it to the lines
1117 */
1118 if (!found) {
1119 lines.append(QPair<WordsWithCharacters, QRect>({*it}, elementArea));
1120 }
1121 }
1122
1123 // Step 3
1124 for (QPair<WordsWithCharacters, QRect> &line : lines) {
1126 std::sort(list.begin(), list.end(), compareTinyTextEntityX);
1127 }
1128 lines.shrink_to_fit();
1129
1130 return lines;
1131}
1132
1133/**
1134 * Calculate Statistical information from the lines we made previously
1135 */
1136static void calculateStatisticalInformation(const QList<WordWithCharacters> &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing)
1137{
1138 /**
1139 * For the region, defined by line_rects and lines
1140 * 1. Make line statistical analysis to find the line spacing
1141 * 2. Make character statistical analysis to differentiate between
1142 * word spacing and column spacing.
1143 */
1144
1145 /**
1146 * Step 0
1147 */
1148 const QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(words, pageWidth, pageHeight);
1149
1150 /**
1151 * Step 1
1152 */
1153 QMap<int, int> line_space_stat;
1154 for (int i = 0; i < sortedLines.length(); i++) {
1155 const QRect rectUpper = sortedLines.at(i).second;
1156
1157 if (i + 1 == sortedLines.length()) {
1158 break;
1159 }
1160 const QRect rectLower = sortedLines.at(i + 1).second;
1161
1162 int linespace = rectLower.top() - (rectUpper.top() + rectUpper.height());
1163 if (linespace < 0) {
1164 linespace = -linespace;
1165 }
1166
1167 if (line_space_stat.contains(linespace)) {
1168 line_space_stat[linespace]++;
1169 } else {
1170 line_space_stat[linespace] = 1;
1171 }
1172 }
1173
1174 *line_spacing = 0;
1175 int weighted_count = 0;
1176 QMapIterator<int, int> iterate_linespace(line_space_stat);
1177
1178 while (iterate_linespace.hasNext()) {
1179 iterate_linespace.next();
1180 *line_spacing += iterate_linespace.value() * iterate_linespace.key();
1181 weighted_count += iterate_linespace.value();
1182 }
1183 if (*line_spacing != 0) {
1184 *line_spacing = (int)((double)*line_spacing / (double)weighted_count + 0.5);
1185 }
1186
1187 /**
1188 * Step 2
1189 */
1190 // We would like to use QMap instead of QHash as it will keep the keys sorted
1191 QMap<int, int> hor_space_stat;
1192 QMap<int, int> col_space_stat;
1193 QList<QList<QRect>> space_rects;
1194 QVector<QRect> max_hor_space_rects;
1195
1196 // Space in every line
1197 for (const QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1198 const WordsWithCharacters list = sortedLine.first;
1199 QList<QRect> line_space_rects;
1200 int maxSpace = 0, minSpace = pageWidth;
1201
1202 // for every TinyTextEntity element in the line
1204 QRect max_area1, max_area2;
1205 // for every line
1206 for (; it != itEnd; it++) {
1207 const QRect area1 = (*it).area().roundedGeometry(pageWidth, pageHeight);
1208 if (it + 1 == itEnd) {
1209 break;
1210 }
1211
1212 const QRect area2 = (*(it + 1)).area().roundedGeometry(pageWidth, pageHeight);
1213 int space = area2.left() - area1.right();
1214
1215 if (space > maxSpace) {
1216 max_area1 = area1;
1217 max_area2 = area2;
1218 maxSpace = space;
1219 }
1220
1221 if (space < minSpace && space != 0) {
1222 minSpace = space;
1223 }
1224
1225 // if we found a real space, whose length is not zero and also less than the pageWidth
1226 if (space != 0 && space != pageWidth) {
1227 // increase the count of the space amount
1228 if (hor_space_stat.contains(space)) {
1229 hor_space_stat[space]++;
1230 } else {
1231 hor_space_stat[space] = 1;
1232 }
1233
1234 int left, right, top, bottom;
1235
1236 left = area1.right();
1237 right = area2.left();
1238
1239 top = area2.top() < area1.top() ? area2.top() : area1.top();
1240 bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1241
1242 QRect rect(left, top, right - left, bottom - top);
1243 line_space_rects.append(rect);
1244 }
1245 }
1246
1247 space_rects.append(line_space_rects);
1248
1249 if (hor_space_stat.contains(maxSpace)) {
1250 if (hor_space_stat[maxSpace] != 1) {
1251 hor_space_stat[maxSpace]--;
1252 } else {
1253 hor_space_stat.remove(maxSpace);
1254 }
1255 }
1256
1257 if (maxSpace != 0) {
1258 if (col_space_stat.contains(maxSpace)) {
1259 col_space_stat[maxSpace]++;
1260 } else {
1261 col_space_stat[maxSpace] = 1;
1262 }
1263
1264 // store the max rect of each line
1265 const int left = max_area1.right();
1266 const int right = max_area2.left();
1267 const int top = (max_area1.top() > max_area2.top()) ? max_area2.top() : max_area1.top();
1268 const int bottom = (max_area1.bottom() < max_area2.bottom()) ? max_area2.bottom() : max_area1.bottom();
1269
1270 const QRect rect(left, top, right - left, bottom - top);
1271 max_hor_space_rects.append(rect);
1272 } else {
1273 max_hor_space_rects.append(QRect(0, 0, 0, 0));
1274 }
1275 }
1276
1277 // All the between word space counts are in hor_space_stat
1278 *word_spacing = 0;
1279 weighted_count = 0;
1280 QMapIterator<int, int> iterate(hor_space_stat);
1281
1282 while (iterate.hasNext()) {
1283 iterate.next();
1284
1285 if (iterate.key() > 0) {
1286 *word_spacing += iterate.value() * iterate.key();
1287 weighted_count += iterate.value();
1288 }
1289 }
1290 if (weighted_count) {
1291 *word_spacing = (int)((double)*word_spacing / (double)weighted_count + 0.5);
1292 }
1293
1294 *col_spacing = 0;
1295 QMapIterator<int, int> iterate_col(col_space_stat);
1296
1297 while (iterate_col.hasNext()) {
1298 iterate_col.next();
1299 if (iterate_col.value() > *col_spacing) {
1300 *col_spacing = iterate_col.value();
1301 }
1302 }
1303 *col_spacing = col_space_stat.key(*col_spacing);
1304
1305 // if there is just one line in a region, there is no point in dividing it
1306 if (sortedLines.length() == 1) {
1307 *word_spacing = *col_spacing;
1308 }
1309}
1310
1311/**
1312 * Implements the XY Cut algorithm for textpage segmentation
1313 * The resulting RegionTextList will contain RegionText whose WordsWithCharacters::word and
1314 * WordsWithCharacters::characters are reused from wordsWithCharacters (i.e. no new nor delete happens in this function)
1315 */
1316static RegionTextList XYCutForBoundingBoxes(const QList<WordWithCharacters> &wordsWithCharacters, int pageWidth, int pageHeight)
1317{
1318 RegionTextList tree;
1319 QRect contentRect(0, 0, pageWidth, pageHeight);
1320 const RegionText root(wordsWithCharacters, contentRect);
1321
1322 // start the tree with the root, it is our only region at the start
1323 tree.push_back(root);
1324
1325 int i = 0;
1326
1327 // while traversing the tree has not been ended
1328 while (i < tree.length()) {
1329 const RegionText node = tree.at(i);
1330 QRect regionRect = node.area();
1331
1332 /**
1333 * 1. calculation of projection profiles
1334 */
1335 // allocate the size of proj profiles and initialize with 0
1336 int size_proj_y = node.area().height();
1337 int size_proj_x = node.area().width();
1338 // dynamic memory allocation
1339 QVarLengthArray<int> proj_on_xaxis(size_proj_x);
1340 QVarLengthArray<int> proj_on_yaxis(size_proj_y);
1341
1342 for (int j = 0; j < size_proj_y; ++j) {
1343 proj_on_yaxis[j] = 0;
1344 }
1345 for (int j = 0; j < size_proj_x; ++j) {
1346 proj_on_xaxis[j] = 0;
1347 }
1348
1349 const QList<WordWithCharacters> list = node.text();
1350
1351 // Calculate tcx and tcy locally for each new region
1352 int word_spacing, line_spacing, column_spacing;
1353 calculateStatisticalInformation(list, pageWidth, pageHeight, &word_spacing, &line_spacing, &column_spacing);
1354
1355 const int tcx = word_spacing * 2;
1356 const int tcy = line_spacing * 2;
1357
1358 int maxX = 0, maxY = 0;
1359 int avgX = 0;
1360 int count;
1361
1362 // for every text in the region
1363 for (const WordWithCharacters &wwc : list) {
1364 const QRect entRect = wwc.area().geometry(pageWidth, pageHeight);
1365
1366 // calculate vertical projection profile proj_on_xaxis1
1367 for (int k = entRect.left(); k <= entRect.left() + entRect.width(); ++k) {
1368 if ((k - regionRect.left()) < size_proj_x && (k - regionRect.left()) >= 0) {
1369 proj_on_xaxis[k - regionRect.left()] += entRect.height();
1370 }
1371 }
1372
1373 // calculate horizontal projection profile in the same way
1374 for (int k = entRect.top(); k <= entRect.top() + entRect.height(); ++k) {
1375 if ((k - regionRect.top()) < size_proj_y && (k - regionRect.top()) >= 0) {
1376 proj_on_yaxis[k - regionRect.top()] += entRect.width();
1377 }
1378 }
1379 }
1380
1381 for (int j = 0; j < size_proj_y; ++j) {
1382 if (proj_on_yaxis[j] > maxY) {
1383 maxY = proj_on_yaxis[j];
1384 }
1385 }
1386
1387 avgX = count = 0;
1388 for (int j = 0; j < size_proj_x; ++j) {
1389 if (proj_on_xaxis[j] > maxX) {
1390 maxX = proj_on_xaxis[j];
1391 }
1392 if (proj_on_xaxis[j]) {
1393 count++;
1394 avgX += proj_on_xaxis[j];
1395 }
1396 }
1397 if (count) {
1398 avgX /= count;
1399 }
1400
1401 /**
1402 * 2. Cleanup Boundary White Spaces and removal of noise
1403 */
1404 int xbegin = 0, xend = size_proj_x - 1;
1405 int ybegin = 0, yend = size_proj_y - 1;
1406 while (xbegin < size_proj_x && proj_on_xaxis[xbegin] <= 0) {
1407 xbegin++;
1408 }
1409 while (xend >= 0 && proj_on_xaxis[xend] <= 0) {
1410 xend--;
1411 }
1412 while (ybegin < size_proj_y && proj_on_yaxis[ybegin] <= 0) {
1413 ybegin++;
1414 }
1415 while (yend >= 0 && proj_on_yaxis[yend] <= 0) {
1416 yend--;
1417 }
1418
1419 // update the regionRect
1420 int old_left = regionRect.left(), old_top = regionRect.top();
1421 regionRect.setLeft(old_left + xbegin);
1422 regionRect.setRight(old_left + xend);
1423 regionRect.setTop(old_top + ybegin);
1424 regionRect.setBottom(old_top + yend);
1425
1426 int tnx = (int)((double)avgX * 10.0 / 100.0 + 0.5), tny = 0;
1427 for (int j = 0; j < size_proj_x; ++j) {
1428 proj_on_xaxis[j] -= tnx;
1429 }
1430 for (int j = 0; j < size_proj_y; ++j) {
1431 proj_on_yaxis[j] -= tny;
1432 }
1433
1434 /**
1435 * 3. Find the Widest gap
1436 */
1437 int gap_hor = -1, pos_hor = -1;
1438 int begin = -1, end = -1;
1439
1440 // find all hor_gaps and find the maximum between them
1441 for (int j = 1; j < size_proj_y; ++j) {
1442 // transition from white to black
1443 if (begin >= 0 && proj_on_yaxis[j - 1] <= 0 && proj_on_yaxis[j] > 0) {
1444 end = j;
1445 }
1446
1447 // transition from black to white
1448 if (proj_on_yaxis[j - 1] > 0 && proj_on_yaxis[j] <= 0) {
1449 begin = j;
1450 }
1451
1452 if (begin > 0 && end > 0 && end - begin > gap_hor) {
1453 gap_hor = end - begin;
1454 pos_hor = (end + begin) / 2;
1455 begin = -1;
1456 end = -1;
1457 }
1458 }
1459
1460 begin = -1, end = -1;
1461 int gap_ver = -1, pos_ver = -1;
1462
1463 // find all the ver_gaps and find the maximum between them
1464 for (int j = 1; j < size_proj_x; ++j) {
1465 // transition from white to black
1466 if (begin >= 0 && proj_on_xaxis[j - 1] <= 0 && proj_on_xaxis[j] > 0) {
1467 end = j;
1468 }
1469
1470 // transition from black to white
1471 if (proj_on_xaxis[j - 1] > 0 && proj_on_xaxis[j] <= 0) {
1472 begin = j;
1473 }
1474
1475 if (begin > 0 && end > 0 && end - begin > gap_ver) {
1476 gap_ver = end - begin;
1477 pos_ver = (end + begin) / 2;
1478 begin = -1;
1479 end = -1;
1480 }
1481 }
1482
1483 int cut_pos_x = pos_ver, cut_pos_y = pos_hor;
1484 int gap_x = gap_ver, gap_y = gap_hor;
1485
1486 /**
1487 * 4. Cut the region and make nodes (left,right) or (up,down)
1488 */
1489 bool cut_hor = false, cut_ver = false;
1490
1491 // For horizontal cut
1492 const int topHeight = cut_pos_y - (regionRect.top() - old_top);
1493 const QRect topRect(regionRect.left(), regionRect.top(), regionRect.width(), topHeight);
1494 const QRect bottomRect(regionRect.left(), regionRect.top() + topHeight, regionRect.width(), regionRect.height() - topHeight);
1495
1496 // For vertical Cut
1497 const int leftWidth = cut_pos_x - (regionRect.left() - old_left);
1498 const QRect leftRect(regionRect.left(), regionRect.top(), leftWidth, regionRect.height());
1499 const QRect rightRect(regionRect.left() + leftWidth, regionRect.top(), regionRect.width() - leftWidth, regionRect.height());
1500
1501 if (gap_y >= gap_x && gap_y >= tcy) {
1502 cut_hor = true;
1503 } else if (gap_y >= gap_x && gap_y <= tcy && gap_x >= tcx) {
1504 cut_ver = true;
1505 } else if (gap_x >= gap_y && gap_x >= tcx) {
1506 cut_ver = true;
1507 } else if (gap_x >= gap_y && gap_x <= tcx && gap_y >= tcy) {
1508 cut_hor = true;
1509 } else {
1510 // no cut possible
1511 // we can now update the node rectangle with the shrinked rectangle
1512 RegionText tmpNode = tree.at(i);
1513 tmpNode.setArea(regionRect);
1514 tree.replace(i, tmpNode);
1515 i++;
1516 continue;
1517 }
1518
1519 WordsWithCharacters list1, list2;
1520
1521 // horizontal cut, topRect and bottomRect
1522 if (cut_hor) {
1523 for (const WordWithCharacters &word : list) {
1524 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1525
1526 if (topRect.intersects(wordRect)) {
1527 list1.append(word);
1528 } else {
1529 list2.append(word);
1530 }
1531 }
1532
1533 RegionText node1(list1, topRect);
1534 RegionText node2(list2, bottomRect);
1535
1536 tree.replace(i, node1);
1537 tree.insert(i + 1, node2);
1538 }
1539
1540 // vertical cut, leftRect and rightRect
1541 else if (cut_ver) {
1542 for (const WordWithCharacters &word : list) {
1543 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1544
1545 if (leftRect.intersects(wordRect)) {
1546 list1.append(word);
1547 } else {
1548 list2.append(word);
1549 }
1550 }
1551
1552 RegionText node1(list1, leftRect);
1553 RegionText node2(list2, rightRect);
1554
1555 tree.replace(i, node1);
1556 tree.insert(i + 1, node2);
1557 }
1558 }
1559
1560 tree.shrink_to_fit();
1561 return tree;
1562}
1563
1564/**
1565 * Add spaces in between words in a line. It reuses the pointers passed in tree and might add new ones. You will need to take care of deleting them if needed
1566 */
1567TextEntity::List addNecessarySpace(RegionTextList tree, int pageWidth, int pageHeight)
1568{
1569 /**
1570 * 1. Call makeAndSortLines before adding spaces in between words in a line
1571 * 2. Now add spaces between every two words in a line
1572 * 3. Finally, extract all the space separated texts from each region and return it
1573 */
1574
1575 TextEntity::List res;
1576 // Only change the texts under RegionTexts, not the area
1577 for (const RegionText &tmpRegion : std::as_const(tree)) {
1578 // Step 01
1579 QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(tmpRegion.text(), pageWidth, pageHeight);
1580 int counter = 0;
1581
1582 // Step 02
1583 for (QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1584 WordsWithCharacters &list = sortedLine.first;
1585 for (int k = 0; k < list.length(); k++) {
1586 const QRect area1 = list.at(k).area().roundedGeometry(pageWidth, pageHeight);
1587 if (k + 1 >= list.length()) {
1588 break;
1589 }
1590
1591 const QRect area2 = list.at(k + 1).area().roundedGeometry(pageWidth, pageHeight);
1592 const int space = area2.left() - area1.right();
1593
1594 if (space != 0) {
1595 // Make a TinyTextEntity of string space and push it between it and it+1
1596 const int left = area1.right();
1597 const int right = area2.left();
1598 const int top = area2.top() < area1.top() ? area2.top() : area1.top();
1599 const int bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1600
1601 const QString spaceStr(QStringLiteral(" "));
1602 const QRect rect(QPoint(left, top), QPoint(right, bottom));
1603 const NormalizedRect entRect(rect, pageWidth, pageHeight);
1604 TextEntity ent1 = TextEntity(spaceStr, entRect);
1605 WordWithCharacters word(ent1, QList<TextEntity>() << ent1);
1606
1607 list.insert(k + 1, word);
1608
1609 // Skip the space
1610 k++;
1611 }
1612 }
1613 counter += list.length();
1614 }
1615 res.reserve(res.length() + counter);
1616
1617 for (const QPair<WordsWithCharacters, QRect> &sortedLine : std::as_const(sortedLines)) {
1618 for (const WordWithCharacters &word : sortedLine.first) {
1619 res += word.characters;
1620 }
1621 }
1622 }
1623
1624 // Step 03
1625 res.shrink_to_fit();
1626 return res;
1627}
1628
1629/**
1630 * Correct the textOrder, all layout recognition works here
1631 */
1632void TextPagePrivate::correctTextOrder()
1633{
1634 // m_page->width() and m_page->height() are in pixels at
1635 // 100% zoom level, and thus depend on display DPI.
1636 // To avoid Okular failing on lowDPI displays,
1637 // we scale pageWidth and pageHeight so their sum equals 2000.
1638 const double scalingFactor = 2000.0 / (m_page->width() + m_page->height());
1639 const int pageWidth = (int)(scalingFactor * m_page->width());
1640 const int pageHeight = (int)(scalingFactor * m_page->height());
1641
1642 TextEntity::List characters = m_words;
1643
1644 /**
1645 * Remove spaces from the text
1646 */
1647 removeSpace(&characters);
1648
1649 /**
1650 * Construct words from characters
1651 */
1652 const QList<WordWithCharacters> wordsWithCharacters = makeWordFromCharacters(characters, pageWidth, pageHeight);
1653
1654 /**
1655 * Make a XY Cut tree for segmentation of the texts
1656 */
1657 RegionTextList tree = XYCutForBoundingBoxes(wordsWithCharacters, pageWidth, pageHeight);
1658
1659 /**
1660 * Add spaces to the word
1661 */
1662 const auto listOfCharacters = addNecessarySpace(std::move(tree), pageWidth, pageHeight);
1663
1664 setWordList(listOfCharacters);
1665}
1666
1668{
1669 if (area && area->isNull()) {
1670 return TextEntity::List();
1671 }
1672
1673 TextEntity::List ret;
1674 if (area) {
1675 for (const TextEntity &te : std::as_const(d->m_words)) {
1677 if (area->intersects(te.area())) {
1678 ret.append(te);
1679 }
1680 } else {
1681 const NormalizedPoint center = te.area().center();
1682 if (area->contains(center.x, center.y)) {
1683 ret.append(te);
1684 }
1685 }
1686 }
1687 } else {
1688 for (const TextEntity &te : std::as_const(d->m_words)) {
1689 ret.append(te);
1690 }
1691 }
1692 return ret;
1693}
1694
1695std::unique_ptr<RegularAreaRect> TextPage::wordAt(const NormalizedPoint &p) const
1696{
1697 TextEntity::List::ConstIterator itBegin = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
1699 TextEntity::List::ConstIterator posIt = itEnd;
1700 for (; it != itEnd; ++it) {
1701 if (it->area().contains(p.x, p.y)) {
1702 posIt = it;
1703 break;
1704 }
1705 }
1706 if (posIt != itEnd) {
1707 if (posIt->text().simplified().isEmpty()) {
1708 return nullptr;
1709 }
1710 // Find the first TinyTextEntity of the word
1711 while (posIt != itBegin) {
1712 --posIt;
1713 const QString itText = posIt->text();
1714 if (itText.right(1).at(0).isSpace()) {
1715 if (itText.endsWith(QLatin1String("-\n"))) {
1716 // Is an hyphenated word
1717 // continue searching the start of the word back
1718 continue;
1719 }
1720
1721 if (itText == QLatin1String("\n") && posIt != itBegin) {
1722 --posIt;
1723 if (posIt->text().endsWith(QLatin1String("-"))) {
1724 // Is an hyphenated word
1725 // continue searching the start of the word back
1726 continue;
1727 }
1728 ++posIt;
1729 }
1730
1731 ++posIt;
1732 break;
1733 }
1734 }
1735 auto ret = std::make_unique<RegularAreaRect>();
1736 QString foundWord;
1737 for (; posIt != itEnd; ++posIt) {
1738 const QString itText = posIt->text();
1739 if (itText.simplified().isEmpty()) {
1740 break;
1741 }
1742
1743 ret->appendShape(posIt->area());
1744 foundWord += posIt->text();
1745 if (itText.right(1).at(0).isSpace()) {
1746 if (!foundWord.endsWith(QLatin1String("-\n"))) {
1747 break;
1748 }
1749 }
1750 }
1751
1752 return ret;
1753 } else {
1754 return nullptr;
1755 }
1756}
NormalizedPoint is a helper class which stores the coordinates of a normalized point.
Definition area.h:117
double x
The normalized x coordinate.
Definition area.h:166
double y
The normalized y coordinate.
Definition area.h:171
A NormalizedRect is a rectangle which can be defined by two NormalizedPoints.
Definition area.h:189
double bottom
The normalized bottom coordinate.
Definition area.h:435
bool isRight(const NormalizedPoint &pt) const
Returns true if the point pt is located to the left of the right edge of the rectangle.
Definition area.h:376
bool isLeft(const NormalizedPoint &pt) const
Returns true if the point pt is located to the right of the left edge of the rectangle.
Definition area.h:367
bool contains(double x, double y) const
Returns whether the normalized rectangle contains the normalized point (x, y).
Definition area.cpp:160
bool isTop(const NormalizedPoint &pt) const
Returns true if the point pt is located above the top of the rectangle.
Definition area.h:340
bool intersects(const NormalizedRect &other) const
Returns whether the normalized rectangle intersects the other normalized rectangle.
Definition area.cpp:165
bool isTopOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located above the bottom of the rectangle.
Definition area.h:358
bool isBottomOrLevel(const NormalizedPoint &pt) const
Returns true if the point pt is located below the top of the rectangle.
Definition area.h:349
QRect geometry(int xScale, int yScale) const
Returns the rectangle mapped to a reference area of xScale x yScale.
Definition area.cpp:232
double top
The normalized top coordinate.
Definition area.h:425
bool isBottom(const NormalizedPoint &pt) const
Returns true if the point pt is located below the bottom of the rectangle.
Definition area.h:331
QRect roundedGeometry(int xScale, int yScale) const
Same functionality as geometry, but the output is now rounded before typecasting to int.
Definition area.cpp:239
void transform(const QTransform &matrix)
Transforms the normalized rectangle with the operations defined by matrix.
Definition area.cpp:253
This is a list of NormalizedRect, to describe an area consisting of multiple rectangles using normali...
Definition area.h:927
void appendShape(const NormalizedShape &shape, MergeSide side=MergeAll)
Appends the given shape to this area.
Definition area.h:804
bool isNull() const
Returns whether the regular area is a null area.
Definition area.h:745
void simplify()
Simplifies this regular area by merging its intersecting subareas.
Definition area.h:724
bool contains(double x, double y) const
Returns whether this area contains the normalized point (x, y).
Definition area.h:860
bool intersects(const RegularArea< NormalizedShape, Shape > *area) const
Returns whether this area intersects with the given area.
Definition area.h:777
Represents a piece of text on a TextPage, containing its textual representation and its bounding box.
Definition textpage.h:53
QString text() const
Returns the text of the text entity.
Definition textpage.cpp:142
TextEntity(const QString &text, const NormalizedRect &area)
Creates a new text entity with the given text and the given area.
Definition textpage.cpp:134
NormalizedRect area() const
Returns the bounding area of the text entity.
Definition textpage.cpp:147
~TextEntity()
Destroys the text entity.
NormalizedRect transformedArea(const QTransform &matrix) const
Returns the transformed area of the text entity.
Definition textpage.cpp:152
TextEntity::List words(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
Text entity extraction function.
TextAreaInclusionBehaviour
Defines the behaviour of adding characters to text() result.
Definition textpage.h:113
@ AnyPixelTextAreaInclusionBehaviour
A character is included into text() result if any pixel of his bounding box is in the given area.
Definition textpage.h:114
QString text(const RegularAreaRect *area=nullptr) const
Text extraction function.
Definition textpage.cpp:871
std::unique_ptr< RegularAreaRect > wordAt(const NormalizedPoint &p) const
Returns the area and text of the word at the given point Note that ownership of the returned area bel...
RegularAreaRect * findText(int searchID, const QString &query, SearchDirection direction, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
Returns the bounding rect of the text which matches the following criteria or 0 if the search is not ...
Definition textpage.cpp:549
std::unique_ptr< RegularAreaRect > textArea(const TextSelection &selection) const
Returns the rectangular area of the given selection.
Definition textpage.cpp:277
void append(const QString &text, const NormalizedRect &area)
Appends the given text with the given area as new TextEntity to the page.
Definition textpage.cpp:185
~TextPage()
Destroys the text page.
Definition textpage.cpp:180
TextPage()
Creates a new text page.
Definition textpage.cpp:169
Wrapper around the information needed to generate the selection area.
Definition misc.h:19
NormalizedPoint end() const
Returns the end point of the selection.
Definition misc.cpp:42
NormalizedPoint start() const
Returns the start point of the selection.
Definition misc.cpp:37
Q_SCRIPTABLE Q_NOREPLY void start()
std::optional< QSqlQuery > query(const QString &queryStatement)
KIOCORE_EXPORT QStringList list(const QString &fileClass)
const QList< QKeySequence > & begin()
const QList< QKeySequence > & end()
global.h
Definition action.h:17
SearchDirection
Describes the direction of searching.
Definition global.h:36
@ FromTop
Searching from top of the page, next result is to be found, there was no earlier search result.
Definition global.h:37
@ FromBottom
Searching from bottom of the page, next result is to be found, there was no earlier search result.
Definition global.h:38
@ PreviousResult
Searching for the previous result on the page, earlier result should be located so we search from the...
Definition global.h:40
@ NextResult
Searching for the next result on the page, earlier result should be located so we search from the las...
Definition global.h:39
MergeSide
The side(s) to be considered when merging areas.
Definition global.h:64
@ MergeRight
Merge only if the right side of the first area intersect.
Definition global.h:65
bool isSpace(char32_t ucs4)
void append(QList< T > &&value)
const_reference at(qsizetype i) const const
iterator begin()
iterator end()
iterator erase(const_iterator begin, const_iterator end)
T & first()
iterator insert(const_iterator before, parameter_type value)
qsizetype length() const const
void reserve(qsizetype size)
void shrink_to_fit()
bool contains(const Key &key) const const
Key key(const T &value, const Key &defaultKey) const const
size_type remove(const Key &key)
int x() const const
int y() const const
int bottom() const const
QPoint center() const const
int height() const const
int left() const const
int right() const const
void setBottom(int y)
void setHeight(int height)
void setLeft(int x)
void setRight(int x)
void setTop(int y)
void setWidth(int width)
int top() const const
int width() const const
int x() const const
int y() const const
NormalizationForm_C
QString & append(QChar ch)
const QChar at(qsizetype position) const const
bool endsWith(QChar c, Qt::CaseSensitivity cs) const const
QString & insert(qsizetype position, QChar ch)
bool isEmpty() const const
qsizetype length() const const
QString normalized(NormalizationForm mode, QChar::UnicodeVersion version) const const
QString right(qsizetype n) const const
QString simplified() const const
bool startsWith(QChar c, Qt::CaseSensitivity cs) const const
QStringView mid(qsizetype start, qsizetype length) const const
int compare(QChar ch) const const
CaseInsensitive
QTextStream & left(QTextStream &stream)
QTextStream & right(QTextStream &stream)
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 3 2025 11:58:07 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.