• Skip to content
  • Skip to link menu
KDE API Reference
  • KDE API Reference
  • kdelibs API Reference
  • KDE Home
  • Contact Us
 

Nepomuk-Core

  • sources
  • kde-4.12
  • kdelibs
  • nepomuk-core
  • services
  • fileindexer
  • indexer
popplerextractor.cpp
Go to the documentation of this file.
1 /*
2  Nepomuk file indexer plugin: extracts metadata and plain text from PDF files via Poppler.
3  Copyright (C) 2012 Vishesh Handa <me@vhanda.in>
4  Copyright (C) 2012 Jörg Ehrichs <joerg.ehrichs@gmx.de>
5 
6  This library is free software; you can redistribute it and/or
7  modify it under the terms of the GNU Lesser General Public
8  License as published by the Free Software Foundation; either
9  version 2.1 of the License, or (at your option) any later version.
10 
11  This library is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  Lesser General Public License for more details.
15 
16  You should have received a copy of the GNU Lesser General Public
17  License along with this library; if not, write to the Free Software
18  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20 
21 
22 #include "popplerextractor.h"
23 
24 #include "nco.h"
25 #include "nie.h"
26 #include "nfo.h"
27 
28 #include <KDE/KDebug>
29 
30 #include <poppler-qt4.h>
31 
32 using namespace Nepomuk2::Vocabulary;
33 
34 namespace Nepomuk2 {
35 
36 PopplerExtractor::PopplerExtractor(QObject* parent, const QVariantList&)
37 : ExtractorPlugin(parent)
38 {
39 
40 }
41 
42 QStringList PopplerExtractor::mimetypes()
43 {
44  QStringList list;
45  list << QLatin1String("application/pdf");
46 
47  return list;
48 }
49 
50 
51 SimpleResourceGraph PopplerExtractor::extract(const QUrl& resUri, const QUrl& fileUrl, const QString& mimeType)
52 {
53  Q_UNUSED( mimeType );
54 
55  SimpleResourceGraph graph;
56  SimpleResource fileRes( resUri );
57 
58  Poppler::Document* pdfDoc = Poppler::Document::load( fileUrl.toLocalFile(), 0, 0 );
59 
60  if ( !pdfDoc || pdfDoc->isLocked() ){
61  delete pdfDoc;
62  return graph;
63  }
64 
65  QString title = pdfDoc->info(QLatin1String("Title")).trimmed();
66 
67  // The title extracted from the pdf metadata is in many cases not the real title
68  // of the document. Especially for research papers that are exported to pdf.
69  // As mostly the title of a pdf document is written on the first page in the biggest font
70  // we use this if the pdfDoc title is considered junk
71  if(title.isEmpty() ||
72  !title.contains(' ') || // very unlikely the title of a document does only contain one word.
73  title.contains(QLatin1String("Microsoft"), Qt::CaseInsensitive)) { // most research papers i found written with microsoft word
74  // have a garbage title of the pdf creator rather than the real document title
75  title = parseFirstPage(pdfDoc, fileUrl);
76  }
77 
78  if( !title.isEmpty() ) {
79  fileRes.addProperty( NIE::title(), title );
80  }
81 
82  QString subject = pdfDoc->info(QLatin1String("Subject"));
83  if( !subject.isEmpty() ) {
84  fileRes.addProperty( NIE::subject(), subject );
85  }
86 
87  QString creator = pdfDoc->info(QLatin1String("Author"));
88  if( !creator.isEmpty() ) {
89  SimpleResource res;
90  res.addType( NCO::Contact() );
91  res.addProperty( NCO::fullname(), creator );
92  graph << res;
93 
94  fileRes.addProperty( NCO::creator(), res );
95  }
96 
97  QString generator = pdfDoc->info(QLatin1String("Creator"));
98  if( !creator.isEmpty() ) {
99  SimpleResource res;
100  res.addType( NCO::Contact() );
101  res.addProperty( NCO::fullname(), creator );
102  graph << res;
103 
104  fileRes.addProperty( NIE::generator(), res );
105  }
106 
107  QString plainTextContent;
108  for( int i=0; i<pdfDoc->numPages(); i++ ) {
109  if( plainTextContent.size() >= maxPlainTextSize() ) {
110  break;
111  }
112 
113  Poppler::Page* page = pdfDoc->page( i );
114  if(!page) { // broken pdf files do not return a valid page
115  kWarning() << "Could not read page content from" << fileUrl;
116  break;
117  }
118  plainTextContent.append( page->text( QRectF() ) );
119  delete page;
120  }
121 
122  if( !plainTextContent.isEmpty() ) {
123  fileRes.addProperty( NIE::plainTextContent(), plainTextContent );
124  }
125 
126  fileRes.addType( NFO::PaginatedTextDocument() );
127 
128  delete pdfDoc;
129 
130  graph << fileRes;
131  return graph;
132 }
133 
134 QString PopplerExtractor::parseFirstPage(Poppler::Document* pdfDoc, const QUrl& fileUrl)
135 {
136  Poppler::Page *p = pdfDoc->page(0);
137 
138  if(!p) {
139  kWarning() << "Could not read page content from" << fileUrl;
140  return QString();
141  }
142 
143  QList<Poppler::TextBox*> tbList = p->textList();
144  QMap<int, QString> possibleTitleMap;
145 
146  int currentLargestChar = 0;
147  int skipTextboxes = 0;
148 
149  // Iterate over all textboxes. Each textbox can be a single character/word or textblock
150  // Here we combine the etxtboxes back together based on the textsize
151  // Important are the words with the biggest font size
152  foreach (Poppler::TextBox * tb, tbList) {
153 
154  // if we added followup words, skip the textboxes here now
155  if (skipTextboxes > 0) {
156  skipTextboxes--;
157  continue;
158  }
159 
160  int height = tb->charBoundingBox(0).height();
161 
162  // if the following text is smaller than the biggest we found up to now, ignore it
163  if (height >= currentLargestChar) {
164  QString possibleTitle;
165  possibleTitle.append(tb->text());
166  currentLargestChar = height;
167 
168  // if the text has follow up words add them to to create the full title
169  Poppler::TextBox * next = tb->nextWord();
170  while (next) {
171  possibleTitle.append(QLatin1String(" "));
172  possibleTitle.append(next->text());
173  next = next->nextWord();
174  skipTextboxes++;
175  }
176 
177  // now combine text for each font size together, very likeley it must be connected
178  QString existingTitlePart = possibleTitleMap.value(currentLargestChar, QString());
179  existingTitlePart.append(QLatin1String(" "));
180  existingTitlePart.append(possibleTitle);
181  possibleTitleMap.insert(currentLargestChar, existingTitlePart);
182  }
183  }
184 
185  qDeleteAll(tbList);
186  delete p;
187 
188  QList<int> titleSizes = possibleTitleMap.keys();
189  qSort(titleSizes.begin(), titleSizes.end(), qGreater<int>());
190 
191  QString newPossibleTitle;
192 
193  // find the text with the largest font that is not just 1 character
194  foreach (int i, titleSizes) {
195  QString title = possibleTitleMap.value(i);
196 
197  // sometime the biggest part is a single letter
198  // as a starting paragraph letter
199  if (title.size() < 5) {
200  continue;
201  } else {
202  newPossibleTitle = title.trimmed();
203  break;
204  }
205  }
206 
207  // Sometimes the titles that are extracted are too large. This is a way of trimming them.
208  newPossibleTitle.truncate( 50 );
209  return newPossibleTitle;
210 }
211 
212 }
213 
214 NEPOMUK_EXPORT_EXTRACTOR( Nepomuk2::PopplerExtractor, "nepomukpopplerextractor" )
Nepomuk2::ExtractorPlugin
The ExtractorPlugin is the base class for all file metadata extractors.
Definition: extractorplugin.h:60
Nepomuk2::SimpleResource
Represents a snapshot of one Nepomuk resource.
Definition: simpleresource.h:46
QObject
Nepomuk2::SimpleResource::addProperty
void addProperty(const QUrl &property, const QVariant &value)
Add a property.
Definition: simpleresource.cpp:206
Nepomuk2::SimpleResourceGraph
Definition: simpleresourcegraph.h:48
NEPOMUK_EXPORT_EXTRACTOR
#define NEPOMUK_EXPORT_EXTRACTOR(classname, libname)
Export a Nepomuk file extractor.
Definition: extractorplugin.h:163
Nepomuk2::ExtractorPlugin::maxPlainTextSize
static int maxPlainTextSize()
Virtuoso does not support streaming operators, and does not accept queries above a certain size...
Definition: extractorplugin.cpp:124
Nepomuk2::SimpleResource::addType
void addType(const QUrl &type)
A convenience method which adds a property of type rdf:type.
Definition: simpleresource.cpp:257
Nepomuk2::PopplerExtractor::PopplerExtractor
PopplerExtractor(QObject *parent, const QVariantList &)
Definition: popplerextractor.cpp:36
Nepomuk2::PopplerExtractor::mimetypes
virtual QStringList mimetypes()
Provide a list of mimetypes which are supported by this plugin.
Definition: popplerextractor.cpp:42
Nepomuk2::PopplerExtractor::extract
virtual SimpleResourceGraph extract(const QUrl &resUri, const QUrl &fileUrl, const QString &mimeType)
The main function of the plugin that is responsible for extracting the data from the file url and ret...
Definition: popplerextractor.cpp:51
popplerextractor.h
Nepomuk2::PopplerExtractor
Definition: popplerextractor.h:32
This file is part of the KDE documentation.
Documentation copyright © 1996-2014 The KDE developers.
Generated on Tue Oct 14 2014 22:48:08 by doxygen 1.8.7 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

Nepomuk-Core

Skip menu "Nepomuk-Core"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Modules
  • Related Pages

kdelibs API Reference

Skip menu "kdelibs API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  • kjsembed
  •   WTF
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Nepomuk-Core
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver

Search



Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal