• Skip to content
  • Skip to link menu
KDE API Reference
  • KDE API Reference
  • kdelibs API Reference
  • KDE Home
  • Contact Us
 

Nepomuk-Core

  • sources
  • kde-4.12
  • kdelibs
  • nepomuk-core
  • services
  • fileindexer
  • indexer
epubextractor.cpp
Go to the documentation of this file.
1 /*
2  Copyright (C) 2013 Vishesh Handa <me@vhanda.in>
3 
4  This library is free software; you can redistribute it and/or
5  modify it under the terms of the GNU Lesser General Public
6  License as published by the Free Software Foundation; either
7  version 2.1 of the License, or (at your option) any later version.
8 
9  This library is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  Lesser General Public License for more details.
13 
14  You should have received a copy of the GNU Lesser General Public
15  License along with this library; if not, write to the Free Software
16  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18 
19 
20 #include "epubextractor.h"
21 
22 #include <epub.h>
23 
24 #include "nie.h"
25 #include "nfo.h"
26 #include "nco.h"
27 
28 #include <Soprano/Vocabulary/NAO>
29 
30 #include <KDebug>
31 #include <QtCore/QDateTime>
32 #include <QTextDocument>
33 
34 using namespace Nepomuk2::Vocabulary;
35 using namespace Soprano::Vocabulary;
36 
37 namespace Nepomuk2 {
38 
39 EPubExtractor::EPubExtractor(QObject* parent, const QVariantList& )
40  : ExtractorPlugin(parent)
41 {
42 
43 }
44 
45 QStringList EPubExtractor::mimetypes()
46 {
47  QStringList types;
48  types << QLatin1String("application/epub+zip");
49 
50  return types;
51 }
52 
53 namespace {
54  QString fetchMetadata(struct epub* e, const epub_metadata& type) {
55  int size = 0;
56 
57  unsigned char** data = epub_get_metadata(e, type, &size);
58  if( data ) {
59  QStringList strList;
60  for( int i=0; i<size; i++ ) {
61  strList << QString::fromUtf8((char*)data[i]);
62  free(data[i]);
63  }
64  free(data);
65 
66  return strList.join(";");
67  }
68  return QString();
69  }
70 }
71 
72 SimpleResourceGraph EPubExtractor::extract(const QUrl& resUri, const QUrl& fileUrl, const QString& mimeType)
73 {
74  Q_UNUSED( mimeType );
75 
76  struct epub* ePubDoc = epub_open( fileUrl.toLocalFile().toUtf8().constData(), 1 );
77  if( !ePubDoc ) {
78  kError() << "Invalid document";
79  return SimpleResourceGraph();
80  }
81 
82  SimpleResource fileRes(resUri);
83  SimpleResourceGraph graph;
84 
85  QString value = fetchMetadata(ePubDoc, EPUB_TITLE);
86  if( !value.isEmpty() ) {
87  fileRes.addProperty( NIE::title(), value );
88  }
89 
90  value = fetchMetadata(ePubDoc, EPUB_SUBJECT);
91  if( !value.isEmpty() ) {
92  fileRes.addProperty( NIE::subject(), value );
93  }
94 
95  value = fetchMetadata(ePubDoc, EPUB_CREATOR);
96  if( !value.isEmpty() ) {
97  if( value.startsWith(QLatin1String("aut:"), Qt::CaseInsensitive) ) {
98  value = value.mid(4).simplified();
99  }
100  else if( value.startsWith(QLatin1String("author:"), Qt::CaseInsensitive) ) {
101  value = value.mid(7).simplified();
102  }
103 
104  // A lot of authors have their name written in () again. We discard that part
105  int index = value.indexOf( '(' );
106  if( index )
107  value = value.mid( 0, index );
108 
109  SimpleResource con;
110  con.addType( NCO::Contact() );
111  con.addProperty( NCO::fullname(), value );
112 
113  fileRes.addProperty( NCO::creator(), con );
114  graph << con;
115  }
116 
117  // The Contributor just seems to be mostly Calibre aka the Generator
118  /*
119  value = fetchMetadata(ePubDoc, EPUB_CONTRIB);
120  if( !value.isEmpty() ) {
121  SimpleResource con;
122  con.addType( NCO::Contact() );
123  con.addProperty( NCO::fullname(), value );
124 
125  fileRes.addProperty( NCO::contributor(), con );
126  graph << con;
127  }*/
128 
129  value = fetchMetadata(ePubDoc, EPUB_PUBLISHER);
130  if( !value.isEmpty() ) {
131  SimpleResource con;
132  con.addType( NCO::Contact() );
133  con.addProperty( NCO::fullname(), value );
134 
135  fileRes.addProperty( NCO::publisher(), con );
136  graph << con;
137  }
138 
139  value = fetchMetadata(ePubDoc, EPUB_DESCRIPTION);
140  if( !value.isEmpty() ) {
141  // nao:description is used for user visible comments. This field is generally
142  // a huge summary of the ebook
143  fileRes.addProperty( NIE::comment(), value );
144  }
145 
146  value = fetchMetadata(ePubDoc, EPUB_DATE);
147  if( !value.isEmpty() ) {
148  if( value.startsWith("Unspecified:", Qt::CaseInsensitive) ) {
149  value = value.mid( QString("Unspecified:").size() ).simplified();
150  }
151  int ind = value.indexOf("publication:", Qt::CaseInsensitive);
152  if( ind != -1) {
153  value = value.mid( ind + QString("publication:").size() ).simplified();
154  }
155  QDateTime dt = ExtractorPlugin::dateTimeFromString(value);
156  if( !dt.isNull() )
157  fileRes.addProperty( NIE::contentCreated(), dt );
158  }
159 
160  //
161  // Plain Text
162  //
163 
164  QString plainText;
165 
166  struct eiterator* iter = epub_get_iterator(ePubDoc, EITERATOR_SPINE, 0 );
167  do {
168  char * curr = epub_it_get_curr(iter);
169  if (!curr)
170  continue;
171  QString html = QString::fromUtf8(curr);
172 
173  QTextDocument doc;
174  doc.setHtml( html );
175  plainText.append( doc.toPlainText() + "\n" );
176 
177  if( plainText.size() >= maxPlainTextSize() )
178  break;
179 
180  } while( epub_it_get_next(iter) );
181 
182  epub_free_iterator(iter);
183 
184  struct titerator* tit;
185 
186  tit = epub_get_titerator(ePubDoc, TITERATOR_NAVMAP, 0 );
187  if( !tit ) {
188  tit = epub_get_titerator(ePubDoc, TITERATOR_GUIDE, 0 );
189  }
190 
191  if( epub_tit_curr_valid(tit) ) {
192  do {
193  char *clink = epub_tit_get_curr_link(tit);
194 
195  char* data;
196  int size = epub_get_data(ePubDoc, clink, &data);
197  free(clink);
198 
199  // epub_get_data returns -1 on failure
200  if( size > 0 && data ) {
201  QString html = QString::fromUtf8(data, size);
202 
203  QTextDocument doc;
204  doc.setHtml( html );
205  plainText.append( doc.toPlainText() + "\n" );
206  free(data);
207 
208  if( plainText.size() >= maxPlainTextSize() )
209  break;
210  }
211  } while( epub_tit_next(tit) );
212  }
213  epub_free_titerator(tit);
214 
215  if( !plainText.isEmpty() )
216  fileRes.addProperty( NIE::plainTextContent(), plainText );
217 
218  if( fileRes.isValid() )
219  graph << fileRes;
220  return graph;
221 }
222 
223 }
224 
// Export Nepomuk2::EPubExtractor as a Nepomuk file extractor under the library name "nepomukepubextractor".
225 NEPOMUK_EXPORT_EXTRACTOR( Nepomuk2::EPubExtractor, "nepomukepubextractor" )
epubextractor.h
Nepomuk2::ExtractorPlugin
The ExtractorPlugin is the base class for all file metadata extractors.
Definition: extractorplugin.h:60
Nepomuk2::SimpleResource
Represents a snapshot of one Nepomuk resource.
Definition: simpleresource.h:46
QObject
Nepomuk2::EPubExtractor
Definition: epubextractor.h:28
Nepomuk2::SimpleResource::addProperty
void addProperty(const QUrl &property, const QVariant &value)
Add a property.
Definition: simpleresource.cpp:206
Nepomuk2::SimpleResourceGraph
Definition: simpleresourcegraph.h:48
Nepomuk2::SimpleResource::isValid
bool isValid() const
Definition: simpleresource.cpp:132
NEPOMUK_EXPORT_EXTRACTOR
#define NEPOMUK_EXPORT_EXTRACTOR(classname, libname)
Export a Nepomuk file extractor.
Definition: extractorplugin.h:163
Nepomuk2::EPubExtractor::extract
virtual SimpleResourceGraph extract(const QUrl &resUri, const QUrl &fileUrl, const QString &mimeType)
The main function of the plugin that is responsible for extracting the data from the file url and ret...
Definition: epubextractor.cpp:72
Nepomuk2::EPubExtractor::EPubExtractor
EPubExtractor(QObject *parent, const QVariantList &)
Definition: epubextractor.cpp:39
Nepomuk2::ExtractorPlugin::maxPlainTextSize
static int maxPlainTextSize()
Virtuoso does not support streaming operators, and does not accept queries above a certain size...
Definition: extractorplugin.cpp:124
Nepomuk2::EPubExtractor::mimetypes
virtual QStringList mimetypes()
Provide a list of mimetypes which are supported by this plugin.
Definition: epubextractor.cpp:45
Nepomuk2::ExtractorPlugin::dateTimeFromString
static QDateTime dateTimeFromString(const QString &dateString)
Tries to extract a valid date time from the string provided.
Definition: extractorplugin.cpp:59
Nepomuk2::SimpleResource::addType
void addType(const QUrl &type)
A convenience method which adds a property of type rdf:type.
Definition: simpleresource.cpp:257
This file is part of the KDE documentation.
Documentation copyright © 1996-2014 The KDE developers.
Generated on Tue Oct 14 2014 22:48:08 by doxygen 1.8.7 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

Nepomuk-Core

Skip menu "Nepomuk-Core"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Modules
  • Related Pages

kdelibs API Reference

Skip menu "kdelibs API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  • kjsembed
  •   WTF
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Nepomuk-Core
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver

Search



Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal