KFileMetaData

epubextractor.cpp
1 /*
2  Copyright (C) 2013 Vishesh Handa <[email protected]>
3  Copyright (C) 2016 Christoph Cullmann <[email protected]>
4 
5  This library is free software; you can redistribute it and/or
6  modify it under the terms of the GNU Lesser General Public
7  License as published by the Free Software Foundation; either
8  version 2.1 of the License, or (at your option) any later version.
9 
10  This library is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  Lesser General Public License for more details.
14 
15  You should have received a copy of the GNU Lesser General Public
16  License along with this library; if not, write to the Free Software
17  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19 
20 
21 #include "epubextractor.h"
22 
23 #include <epub.h>
24 
25 #include <QDateTime>
26 #include <QRegularExpression>
27 #include <QDebug>
28 
29 using namespace KFileMetaData;
30 
31 EPubExtractor::EPubExtractor(QObject* parent)
32  : ExtractorPlugin(parent)
33 {
34 
35 }
36 
37 namespace
38 {
39 static const QStringList supportedMimeTypes = {
40  QStringLiteral("application/epub+zip"),
41 };
42 
43 QString fetchMetadata(struct epub* e, const epub_metadata& type)
44 {
45  int size = 0;
46  unsigned char** data = epub_get_metadata(e, type, &size);
47  if (data) {
48  QStringList strList;
49  for (int i = 0; i < size; i++) {
50  // skip nullptr entries, can happen for broken xml files
51  if (!data[i])
52  continue;
53 
54  strList << QString::fromUtf8((char*)data[i]);
55  free(data[i]);
56  }
57  free(data);
58 
59  return strList.join(QLatin1String(", "));
60  }
61  return QString();
62 }
63 }
64 
65 QStringList EPubExtractor::mimetypes() const
66 {
67  return supportedMimeTypes;
68 }
69 
70 void EPubExtractor::extract(ExtractionResult* result)
71 {
72  // open epub, return on exit, file will be closed again at end of function
73  auto ePubDoc = epub_open(result->inputUrl().toUtf8().constData(), 1);
74  if (!ePubDoc) {
75  qWarning() << "Invalid document";
76  return;
77  }
78 
79  result->addType(Type::Document);
80 
81  if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
82 
83  QString value = fetchMetadata(ePubDoc, EPUB_TITLE);
84  if (!value.isEmpty()) {
85  result->add(Property::Title, value);
86  }
87 
88  value = fetchMetadata(ePubDoc, EPUB_SUBJECT);
89  if (!value.isEmpty()) {
90  result->add(Property::Subject, value);
91  }
92 
93  value = fetchMetadata(ePubDoc, EPUB_CREATOR);
94  if (!value.isEmpty()) {
95  if (value.startsWith(QLatin1String("aut:"), Qt::CaseInsensitive)) {
96  value = value.mid(4).simplified();
97  } else if (value.startsWith(QLatin1String("author:"), Qt::CaseInsensitive)) {
98  value = value.mid(7).simplified();
99  }
100 
101  // A lot of authors have their name written in () again. We discard that part
102  int index = value.indexOf(QLatin1Char('('));
103  if (index)
104  value = value.mid(0, index);
105 
106  result->add(Property::Author, value);
107  }
108 
109  // The Contributor just seems to be mostly Calibre aka the Generator
110  /*
111  value = fetchMetadata(ePubDoc, EPUB_CONTRIB);
112  if( !value.isEmpty() ) {
113  SimpleResource con;
114  con.addType( NCO::Contact() );
115  con.addProperty( NCO::fullname(), value );
116 
117  fileRes.addProperty( NCO::contributor(), con );
118  graph << con;
119  }*/
120 
121  value = fetchMetadata(ePubDoc, EPUB_PUBLISHER);
122  if (!value.isEmpty()) {
123  result->add(Property::Publisher, value);
124  }
125 
126  value = fetchMetadata(ePubDoc, EPUB_DESCRIPTION);
127  if (!value.isEmpty()) {
128  result->add(Property::Description, value);
129  }
130 
131  value = fetchMetadata(ePubDoc, EPUB_DATE);
132  if (!value.isEmpty()) {
133  if (value.startsWith(QLatin1String("Unspecified:"), Qt::CaseInsensitive)) {
134  value = value.mid(QByteArray("Unspecified:").size()).simplified();
135  }
136  int ind = value.indexOf(QLatin1String("publication:"), Qt::CaseInsensitive);
137  if (ind != -1) {
138  value = value.mid(ind + QByteArray("publication:").size()).simplified();
139  }
141  if (!dt.isNull()) {
142  result->add(Property::CreationDate, dt);
143  result->add(Property::ReleaseYear, dt.date().year());
144  }
145  }
146  }
147 
148  //
149  // Plain Text
150  //
151  if (result->inputFlags() & ExtractionResult::ExtractPlainText) {
152  if (auto iter = epub_get_iterator(ePubDoc, EITERATOR_SPINE, 0)) {
153  do {
154  char* curr = epub_it_get_curr(iter);
155  if (!curr)
156  continue;
157 
158  QString html = QString::fromUtf8(curr);
159  html.remove(QRegularExpression(QStringLiteral("<[^>]*>")));
160  result->append(html);
161  } while (epub_it_get_next(iter));
162 
163  epub_free_iterator(iter);
164  }
165 
166  auto tit = epub_get_titerator(ePubDoc, TITERATOR_NAVMAP, 0);
167  if (!tit) {
168  tit = epub_get_titerator(ePubDoc, TITERATOR_GUIDE, 0);
169  }
170  if (tit) {
171  if (epub_tit_curr_valid(tit)) {
172  do {
173  // get link, iterator handles freeing of it
174  char* clink = epub_tit_get_curr_link(tit);
175 
176  // epub_get_data returns -1 on failure
177  char* data = nullptr;
178  const int size = epub_get_data(ePubDoc, clink, &data);
179  if (size >= 0 && data) {
180  QString html = QString::fromUtf8(data, size);
181  // strip html tags
182  html.remove(QRegularExpression(QStringLiteral("<[^>]*>")));
183 
184  result->append(html);
185  free(data);
186  }
187  } while (epub_tit_next(tit));
188  }
189  epub_free_titerator(tit);
190  }
191  }
192 
193  // close epub file again
194  epub_close(ePubDoc);
195 }
int indexOf(QChar ch, int from, Qt::CaseSensitivity cs) const const
static QDateTime dateTimeFromString(const QString &dateString)
Tries to extract a valid date time from the string provided.
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
The ExtractorPlugin is the base class for all file metadata extractors.
QString simplified() const const
QString join(const QString &separator) const const
QString & remove(int position, int n)
virtual void addType(Type::Type type)=0
This function is called by the plugins.
QString fromUtf8(const char *str, int size)
bool isEmpty() const const
const char * constData() const const
bool startsWith(const QString &s, Qt::CaseSensitivity cs) const const
bool isNull() const const
QString mid(int position, int n) const const
QDate date() const const
int year() const const
The ExtractionResult class is where all the data extracted by the indexer is saved.
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
QString inputUrl() const
The input url which the plugins will use to locate the file.
QByteArray toUtf8() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2020 The KDE developers.
Generated on Fri Jun 5 2020 22:55:38 by doxygen 1.8.11 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.