KFileMetaData

epubextractor.cpp
1 /*
2  SPDX-FileCopyrightText: 2013 Vishesh Handa <[email protected]>
3  SPDX-FileCopyrightText: 2016 Christoph Cullmann <[email protected]>
4 
5  SPDX-License-Identifier: LGPL-2.1-or-later
6 */
7 
8 
9 #include "epubextractor.h"
10 #include "kfilemetadata_debug.h"
11 
12 #include <epub.h>
13 
14 #include <QDateTime>
15 #include <QRegularExpression>
16 
17 using namespace KFileMetaData;
18 
19 EPubExtractor::EPubExtractor(QObject* parent)
20  : ExtractorPlugin(parent)
21 {
22 
23 }
24 
25 namespace
26 {
27 static const QStringList supportedMimeTypes = {
28  QStringLiteral("application/epub+zip"),
29 };
30 
31 const QStringList fetchMetadata(struct epub* e, const epub_metadata& type)
32 {
33  int size = 0;
34  unsigned char** data = epub_get_metadata(e, type, &size);
35  if (data) {
36  QStringList strList;
37  strList.reserve(size);
38  for (int i = 0; i < size; i++) {
39  // skip nullptr entries, can happen for broken xml files
40  // also skip empty entries
41  if (!data[i] || !data[i][0]) {
42  continue;
43  }
44 
45  strList << QString::fromUtf8((char*)data[i]);
46  free(data[i]);
47  }
48  free(data);
49 
50  return strList;
51  }
52  return QStringList();
53 }
54 }
55 
56 QStringList EPubExtractor::mimetypes() const
57 {
58  return supportedMimeTypes;
59 }
60 
61 void EPubExtractor::extract(ExtractionResult* result)
62 {
63  // open epub, return on exit, file will be closed again at end of function
64  auto ePubDoc = epub_open(result->inputUrl().toUtf8().constData(), 1);
65  if (!ePubDoc) {
66  qCWarning(KFILEMETADATA_LOG) << "Invalid document";
67  return;
68  }
69 
70  result->addType(Type::Document);
71 
72  if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
73 
74  for (const QString& value : fetchMetadata(ePubDoc, EPUB_TITLE)) {
75  result->add(Property::Title, value);
76  }
77 
78  for (const QString& value : fetchMetadata(ePubDoc, EPUB_SUBJECT)) {
79  result->add(Property::Subject, value);
80  }
81 
82  for (QString value : fetchMetadata(ePubDoc, EPUB_CREATOR)) {
83  // Prefix added by libepub when no opf:role is specified
84  if (value.startsWith(QLatin1String("Author: "), Qt::CaseSensitive)) {
85  value = value.mid(8).simplified();
86  } else {
87  // Find 'opf:role' prefix added by libepub
88  int index = value.indexOf(QLatin1String(": "), Qt::CaseSensitive);
89  if (index > 0) {
90  value = value.mid(index + 2).simplified();
91  }
92  }
93 
94  // Name is provided as "<name>(<file-as>)" when opf:file-as property
95  // is specified, "<name>(<name>)" otherwise. Strip the last part
96  int index = value.indexOf(QLatin1Char('('));
97  if (index > 0) {
98  value = value.mid(0, index);
99  }
100 
101  result->add(Property::Author, value);
102  }
103 
104  // The Contributor just seems to be mostly Calibre aka the Generator
105  /*
106  value = fetchMetadata(ePubDoc, EPUB_CONTRIB);
107  if( !value.isEmpty() ) {
108  SimpleResource con;
109  con.addType( NCO::Contact() );
110  con.addProperty( NCO::fullname(), value );
111 
112  fileRes.addProperty( NCO::contributor(), con );
113  graph << con;
114  }*/
115 
116  for (const QString& value : fetchMetadata(ePubDoc, EPUB_PUBLISHER)) {
117  result->add(Property::Publisher, value);
118  }
119 
120  for (const QString& value : fetchMetadata(ePubDoc, EPUB_DESCRIPTION)) {
121  result->add(Property::Description, value);
122  }
123 
124  for (QString value : fetchMetadata(ePubDoc, EPUB_DATE)) {
125  if (value.startsWith(QLatin1String("Unspecified:"), Qt::CaseInsensitive)) {
126  value = value.mid(12).simplified();
127  } else if (value.startsWith(QLatin1String("publication:"), Qt::CaseInsensitive)) {
128  value = value.mid(12).simplified();
129  } else {
130  continue;
131  }
133  if (!dt.isNull()) {
134  result->add(Property::CreationDate, dt);
135  result->add(Property::ReleaseYear, dt.date().year());
136  }
137  }
138  }
139 
140  //
141  // Plain Text
142  //
143  if (result->inputFlags() & ExtractionResult::ExtractPlainText) {
144  if (auto iter = epub_get_iterator(ePubDoc, EITERATOR_SPINE, 0)) {
145  do {
146  char* curr = epub_it_get_curr(iter);
147  if (!curr) {
148  continue;
149  }
150 
151  QString html = QString::fromUtf8(curr);
152  html.remove(QRegularExpression(QStringLiteral("<[^>]*>")));
153  result->append(html);
154  } while (epub_it_get_next(iter));
155 
156  epub_free_iterator(iter);
157  }
158 
159  auto tit = epub_get_titerator(ePubDoc, TITERATOR_NAVMAP, 0);
160  if (!tit) {
161  tit = epub_get_titerator(ePubDoc, TITERATOR_GUIDE, 0);
162  }
163  if (tit) {
164  if (epub_tit_curr_valid(tit)) {
165  do {
166  // get link, iterator handles freeing of it
167  char* clink = epub_tit_get_curr_link(tit);
168 
169  // epub_get_data returns -1 on failure
170  char* data = nullptr;
171  const int size = epub_get_data(ePubDoc, clink, &data);
172  if (size >= 0 && data) {
173  QString html = QString::fromUtf8(data, size);
174  // strip html tags
175  html.remove(QRegularExpression(QStringLiteral("<[^>]*>")));
176 
177  result->append(html);
178  free(data);
179  }
180  } while (epub_tit_next(tit));
181  }
182  epub_free_titerator(tit);
183  }
184  }
185 
186  // close epub file again
187  epub_close(ePubDoc);
188 }
static QDateTime dateTimeFromString(const QString &dateString)
Tries to extract a valid date time from the string provided.
virtual void addType(Type::Type type)=0
This function is called by the plugins.
The ExtractionResult class is where all the data extracted by the indexer is saved....
QString fromUtf8(const char *str, int size)
CaseSensitive
int year() const const
bool isNull() const const
QString inputUrl() const
The input url which the plugins will use to locate the file.
void reserve(int alloc)
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
QByteArray toUtf8() const const
QString & remove(int position, int n)
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
const char * constData() const const
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
QDate date() const const
The ExtractorPlugin is the base class for all file metadata extractors. It is responsible for extract...
This file is part of the KDE documentation.
Documentation copyright © 1996-2022 The KDE developers.
Generated on Fri May 27 2022 03:47:54 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.