okular
#include "textpage.h"
#include "textpage_p.h"
#include <kdebug.h>
#include "area.h"
#include "debug_p.h"
#include "misc.h"
#include "page.h"
#include "page_p.h"
#include <cstring>
#include <QtAlgorithms>
#include <QVarLengthArray>

Go to the source code of this file.
Typedefs | |
typedef QList< WordWithCharacters > | WordsWithCharacters |
Functions | |
WordsWithCharacters | addNecessarySpace (RegionTextList tree, int pageWidth, int pageHeight) |
static void | calculateStatisticalInformation (const QList< WordWithCharacters > &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing) |
static bool | CaseInsensitiveCmpFn (const QStringRef &from, const QStringRef &to) |
static bool | CaseSensitiveCmpFn (const QStringRef &from, const QStringRef &to) |
static bool | compareTinyTextEntityX (const WordWithCharacters &first, const WordWithCharacters &second) |
static bool | compareTinyTextEntityY (const WordWithCharacters &first, const WordWithCharacters &second) |
static bool | doesConsumeY (const QRect &first, const QRect &second, int threshold) |
QList< QPair < WordsWithCharacters, QRect > > | makeAndSortLines (const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight) |
static WordsWithCharacters | makeWordFromCharacters (const TextList &characters, int pageWidth, int pageHeight) |
static void | removeSpace (TextList *words) |
static int | stringLengthAdaptedWithHyphen (const QString &str, const TextList::ConstIterator &it, const TextList::ConstIterator &textListEnd, PagePrivate *page) |
static RegionTextList | XYCutForBoundingBoxes (const QList< WordWithCharacters > &wordsWithCharacters, const NormalizedRect &boundingBox, int pageWidth, int pageHeight) |
Typedef Documentation
typedef QList<WordWithCharacters> WordsWithCharacters |
Definition at line 266 of file textpage.cpp.
Function Documentation
WordsWithCharacters addNecessarySpace (RegionTextList tree, int pageWidth, int pageHeight)
Add spaces in between words in a line.
It reuses the pointers passed in tree and might add new ones. You will need to take care of deleting them if needed.
- Call makeAndSortLines before adding spaces in between words in a line
- Now add spaces between every two words in a line
- Finally, extract all the space separated texts from each region and return it
Definition at line 1796 of file textpage.cpp.
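static void calculateStatisticalInformation (const QList< WordWithCharacters > &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing)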
Calculate Statistical information from the lines we made previously.
For the region, defined by line_rects and lines
- Make line statistical analysis to find the line spacing
- Make character statistical analysis to differentiate between word spacing and column spacing.
Step 0
Step 1
Step 2
Definition at line 1359 of file textpage.cpp.
static bool CaseInsensitiveCmpFn (const QStringRef &from, const QStringRef &to)
Definition at line 55 of file textpage.cpp.
static bool CaseSensitiveCmpFn (const QStringRef &from, const QStringRef &to)
Definition at line 60 of file textpage.cpp.
static bool compareTinyTextEntityX (const WordWithCharacters &first, const WordWithCharacters &second)
Definition at line 1100 of file textpage.cpp.
static bool compareTinyTextEntityY (const WordWithCharacters &first, const WordWithCharacters &second)
Definition at line 1108 of file textpage.cpp.
static bool doesConsumeY (const QRect &first, const QRect &second, int threshold)
Checks whether the vertical arm of one rectangle fully contains the other (examples below):
 --------         ----         -----  first
   ----         --------       -----  second
or whether the two rectangles overlap by threshold%.
Definition at line 71 of file textpage.cpp.
QList< QPair<WordsWithCharacters, QRect> > makeAndSortLines (const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight)
Create Lines from the words and sort them.
We cannot assume that the generator will give us texts in the right order. We can only assume that we will get the texts in the page and their bounding rectangles. The texts can be characters, words, half-words, or anything else. So, we need to:
- Sort rectangles/boxes containing texts by y0(top)
- Create a text line wherever there is y overlap between TinyTextEntity objects
- Within each line, sort the TinyTextEntity objects by x0 (left)
Definition at line 1265 of file textpage.cpp.
static WordsWithCharacters makeWordFromCharacters (const TextList &characters, int pageWidth, int pageHeight)
We will read the TinyTextEntity from characters and try to create words from there.
Note: characters might be already characters for some generators, but we will keep the nomenclature characters for the generator produced data. The resulting WordsWithCharacters memory has to be managed by the caller, both the WordWithCharacters::word and WordWithCharacters::characters contents
We will traverse characters and try to create words from the TinyTextEntities in it. We will search TinyTextEntity blocks and merge them until we get a space between two consecutive TinyTextEntities. When we get a space, we can take it as the end of a word. Then we store the word as a TinyTextEntity and add it to newList.
We create a RegionText named regionWord that contains the word and the characters associated with it and a rectangle area of the element in newList.
Definition at line 1154 of file textpage.cpp.
static void removeSpace (TextList *words)
Remove all the spaces in between texts.
It makes the text from all generators uniform, whether they save spaces (like PDF) or not (like DjVu).
Definition at line 1129 of file textpage.cpp.
|
static |
Definition at line 767 of file textpage.cpp.
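static RegionTextList XYCutForBoundingBoxes (const QList< WordWithCharacters > &wordsWithCharacters, const NormalizedRect &boundingBox, int pageWidth, int pageHeight)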
Implements the XY Cut algorithm for textpage segmentation. The resulting RegionTextList will contain RegionText whose WordsWithCharacters::word and WordsWithCharacters::characters are reused from wordsWithCharacters (i.e. no new nor delete happens in this function).
- calculation of projection profiles
- Cleanup Boundary White Spaces and removal of noise
- Find the Widest gap
- Cut the region and make nodes (left,right) or (up,down)
Definition at line 1533 of file textpage.cpp.
Documentation copyright © 1996-2014 The KDE developers.
Generated on Tue Oct 14 2014 22:45:03 by doxygen 1.8.7 written by Dimitri van Heesch, © 1997-2006
KDE's Doxygen guidelines are available online.