KItinerary

pdfdocument.cpp
1/*
2 SPDX-FileCopyrightText: 2018 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "config-kitinerary.h"
8#include "pdfdocument.h"
9#include "pdfdocument_p.h"
10#include "pdfextractoroutputdevice_p.h"
11#include "pdfimage_p.h"
12#include "popplerglobalparams_p.h"
13#include "popplerutils_p.h"
14#include "logging.h"
15
16#include <QDebug>
17#include <QImage>
18#include <QScopedValueRollback>
19#include <QTimeZone>
20
21#include <DateInfo.h>
22#include <PDFDoc.h>
23#include <PDFDocEncoding.h>
24#include <Stream.h>
25
26#include <cmath>
27
28using namespace KItinerary;
29
30void PdfPagePrivate::load()
31{
32 if (m_loaded) {
33 return;
34 }
35
36 PopplerGlobalParams gp;
37 PdfExtractorOutputDevice device;
38 m_doc->m_popplerDoc->displayPageSlice(&device, m_pageNum + 1, 72, 72, 0, false, true, false, -1, -1, -1, -1);
39 m_doc->m_popplerDoc->processLinks(&device, m_pageNum + 1);
40 device.finalize();
41 const auto pageRect = m_doc->m_popplerDoc->getPage(m_pageNum + 1)->getCropBox();
42 std::unique_ptr<GooString> s(device.getText(pageRect->x1, pageRect->y1, pageRect->x2, pageRect->y2));
43
44#if KPOPPLER_VERSION >= QT_VERSION_CHECK(0, 72, 0)
45 m_text = QString::fromUtf8(s->c_str());
46#else
47 m_text = QString::fromUtf8(s->getCString());
48#endif
49 m_images = std::move(device.m_images);
50 for (auto it = m_images.begin(); it != m_images.end(); ++it) {
51 (*it).d->m_page = this;
52 }
53
54 m_links = std::move(device.m_links);
55 for (auto &link : m_links) {
56 link.convertToPageRect(pageRect);
57 }
58
59 m_loaded = true;
60}
61
62PdfPage::PdfPage()
63 : d(new PdfPagePrivate)
64{
65}
66
67PdfPage::PdfPage(const PdfPage&) = default;
68PdfPage::~PdfPage() = default;
69PdfPage& PdfPage::operator=(const PdfPage&) = default;
70
71QString PdfPage::text() const
72{
73 d->load();
74 return d->m_text;
75}
76
/**
 * Linear interpolation between @p begin and @p end.
 *
 * @param begin value returned for a ratio of 0
 * @param end value returned for a ratio of 1
 * @param ratio relative position between @p begin and @p end
 */
static double ratio(double begin, double end, double ratio)
{
    return begin + (end - begin) * ratio;
}
81
82QString PdfPage::textInRect(double left, double top, double right, double bottom) const
83{
84 PopplerGlobalParams gp;
85
86 const auto page = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1);
87 const auto pageRect = page->getCropBox();
88
89 double l;
90 double t;
91 double r;
92 double b;
93 switch (page->getRotate()) {
94 case 0:
95 l = ratio(pageRect->x1, pageRect->x2, left);
96 t = ratio(pageRect->y1, pageRect->y2, top);
97 r = ratio(pageRect->x1, pageRect->x2, right);
98 b = ratio(pageRect->y1, pageRect->y2, bottom);
99 break;
100 case 90:
101 l = ratio(pageRect->y1, pageRect->y2, left);
102 t = ratio(pageRect->x1, pageRect->x2, top);
103 r = ratio(pageRect->y1, pageRect->y2, right);
104 b = ratio(pageRect->x1, pageRect->x2, bottom);
105 break;
106 default:
107 qCWarning(Log) << "Unsupported page rotation!" << page->getRotate();
108 return {};
109 }
110
111 TextOutputDev device(nullptr, false, 0, false, false);
112 d->m_doc->m_popplerDoc->displayPageSlice(&device, d->m_pageNum + 1, 72, 72, 0, false, true, false, -1, -1, -1, -1);
113 std::unique_ptr<GooString> s(device.getText(l, t, r, b));
114#if KPOPPLER_VERSION >= QT_VERSION_CHECK(0, 72, 0)
115 return QString::fromUtf8(s->c_str());
116#else
117 return QString::fromUtf8(s->getCString());
118#endif
119}
120
122{
123 d->load();
124 return d->m_images.size();
125}
126
127PdfImage PdfPage::image(int index) const
128{
129 d->load();
130 return d->m_images[index];
131}
132
133QVariantList PdfPage::imagesVariant() const
134{
135 d->load();
136 QVariantList l;
137 l.reserve(imageCount());
138 std::for_each(d->m_images.begin(), d->m_images.end(), [&l](const PdfImage& img) { l.push_back(QVariant::fromValue(img)); });
139 return l;
140}
141
142QVariantList PdfPage::imagesInRect(double left, double top, double right, double bottom) const
143{
144 d->load();
145 QVariantList l;
146 PopplerGlobalParams gp;
147 const auto pageRect = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1)->getCropBox();
148
149 for (const auto &img : d->m_images) {
150 if ((img.d->m_transform.dx() >= ratio(pageRect->x1, pageRect->x2, left) && img.d->m_transform.dx() <= ratio(pageRect->x1, pageRect->x2, right)) &&
151 (img.d->m_transform.dy() >= ratio(pageRect->y1, pageRect->y2, top) && img.d->m_transform.dy() <= ratio(pageRect->y1, pageRect->y2, bottom)))
152 {
153 l.push_back(QVariant::fromValue(img));
154 }
155 }
156 return l;
157}
158
160{
161 d->load();
162 return d->m_links.size();
163}
164
165PdfLink PdfPage::link(int index) const
166{
167 d->load();
168 return d->m_links[index];
169}
170
171QVariantList PdfPage::linksVariant() const
172{
173 d->load();
174 QVariantList l;
175 l.reserve(d->m_links.size());
176 std::transform(d->m_links.begin(), d->m_links.end(), std::back_inserter(l), [](const PdfLink &link) { return QVariant::fromValue(link); });
177 return l;
178}
179
180QVariantList PdfPage::linksInRect(double left, double top, double right, double bottom) const
181{
182 QRectF bbox(QPointF(left, top), QPointF(right, bottom));
183 d->load();
184
185 QVariantList l;
186 for (const auto &link : d->m_links) {
187 if (!link.area().intersects(bbox)) {
188 continue;
189 }
190 l.push_back(QVariant::fromValue(link));
191 }
192
193 std::sort(l.begin(), l.end(), [](const auto &lhs, const auto &rhs) {
194 const auto lhsLink = lhs.template value<PdfLink>();
195 const auto rhsLink = rhs.template value<PdfLink>();
196 if (lhsLink.area().top() == rhsLink.area().top()) {
197 return lhsLink.area().left() < rhsLink.area().left();
198 }
199 return lhsLink.area().top() < rhsLink.area().top();
200 });
201
202 return l;
203}
204
/** Scales @p points by 25.4 / 72, i.e. converts a length in 1/72 inch units to millimeters. */
static constexpr inline double pdfToMM(double points)
{
    return points * 25.4 / 72.0;
}
209
210int PdfPage::width() const
211{
212 const auto page = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1);
213 const auto rot = page->getRotate();
214 if (rot == 90 || rot == 270) {
215 return pdfToMM(page->getCropHeight());
216 }
217 return pdfToMM(page->getCropWidth());
218}
219
220int PdfPage::height() const
221{
222 const auto page = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1);
223 const auto rot = page->getRotate();
224 if (rot == 90 || rot == 270) {
225 return pdfToMM(page->getCropWidth());
226 }
227 return pdfToMM(page->getCropHeight());
228}
229
230
231PdfDocument::PdfDocument(QObject *parent)
232 : QObject(parent)
233 , d(new PdfDocumentPrivate)
234{
235}
236
237PdfDocument::~PdfDocument() = default;
238
239QString PdfDocument::text() const
240{
241 QString text;
242 std::for_each(d->m_pages.begin(), d->m_pages.end(), [&text](const PdfPage &p) { text += p.text(); });
243 return text;
244}
245
246int PdfDocument::pageCount() const
247{
248 return d->m_popplerDoc->getNumPages();
249}
250
252{
253 return d->m_pages[index];
254}
255
257{
258 return d->m_pdfData.size();
259}
260
261#if KPOPPLER_VERSION >= QT_VERSION_CHECK(21, 8, 0)
262static QDateTime parsePdfDateTime(const GooString *str)
263#else
264static QDateTime parsePdfDateTime(const char *str)
265#endif
266{
267 int year;
268 int month;
269 int day;
270 int hour;
271 int min;
272 int sec;
273 int tzHours;
274 int tzMins;
275 char tz;
276
277 if (!parseDateString(str, &year, &month, &day, &hour, &min, &sec, &tz, &tzHours, &tzMins)) {
278 return {};
279 }
280
281 QDate date(year, month, day);
282 QTime time(hour, min, sec);
283 if (!date.isValid() || !time.isValid()) {
284 return {};
285 }
286
287 int offset = tzHours * 3600 + tzMins * 60;
288 if (tz == '+') {
289 return QDateTime(date, time, QTimeZone::fromSecondsAheadOfUtc(offset));
290 } else if (tz == '-') {
291 return QDateTime(date, time, QTimeZone::fromSecondsAheadOfUtc(-offset));
292 }
293 return QDateTime(date, time, QTimeZone::UTC);
294}
295
296QDateTime PdfDocument::creationTime() const
297{
298 std::unique_ptr<GooString> dt(d->m_popplerDoc->getDocInfoCreatDate());
299 if (!dt) {
300 return {};
301 }
302#if KPOPPLER_VERSION >= QT_VERSION_CHECK(21, 8, 0)
303 return parsePdfDateTime(dt.get());
304#elif KPOPPLER_VERSION >= QT_VERSION_CHECK(0, 72, 0)
305 return parsePdfDateTime(dt->c_str());
306#else
307 return parsePdfDateTime(dt->getCString());
308#endif
309}
310
311QDateTime PdfDocument::modificationTime() const
312{
313 std::unique_ptr<GooString> dt(d->m_popplerDoc->getDocInfoModDate());
314 if (!dt) {
315 return {};
316 }
317#if KPOPPLER_VERSION >= QT_VERSION_CHECK(21, 8, 0)
318 return parsePdfDateTime(dt.get());
319#elif KPOPPLER_VERSION >= QT_VERSION_CHECK(0, 72, 0)
320 return parsePdfDateTime(dt->c_str());
321#else
322 return parsePdfDateTime(dt->getCString());
323#endif
324}
325
326
327QString gooStringToUnicode(const std::unique_ptr<GooString> &s)
328{
329 if (!s) {
330 return {};
331 }
332
333 if (s->hasUnicodeMarker() || s->hasUnicodeMarkerLE()) {
334 return QString::fromUtf16(reinterpret_cast<const char16_t*>(s->toStr().c_str()), s->toStr().size() / 2);
335 } else {
336 int len = 0;
337 std::unique_ptr<const char[]> utf16Data(pdfDocEncodingToUTF16(s->toStr(), &len));
338 return QString::fromUtf16(reinterpret_cast<const char16_t*>(utf16Data.get()), len / 2);
339 }
340
341 return QString::fromUtf8(s->c_str());
342}
343
344QString PdfDocument::title() const
345{
346 return gooStringToUnicode(d->m_popplerDoc->getDocInfoTitle());
347}
348
349QString PdfDocument::producer() const
350{
351 return gooStringToUnicode(d->m_popplerDoc->getDocInfoProducer());
352}
353
354QString PdfDocument::creator() const
355{
356 return gooStringToUnicode(d->m_popplerDoc->getDocInfoCreator());
357}
358
359QString PdfDocument::author() const
360{
361 return gooStringToUnicode(d->m_popplerDoc->getDocInfoAuthor());
362}
363
364QVariantList PdfDocument::pagesVariant() const
365{
366 QVariantList l;
367 l.reserve(pageCount());
368 std::for_each(d->m_pages.begin(), d->m_pages.end(), [&l](const PdfPage& p) { l.push_back(QVariant::fromValue(p)); });
369 return l;
370}
371
373{
374 PopplerGlobalParams gp;
375
376 std::unique_ptr<PdfDocument> doc(new PdfDocument(parent));
377 doc->d->m_pdfData = data;
378 // PDFDoc takes ownership of stream
379#if KPOPPLER_VERSION >= QT_VERSION_CHECK(0, 58, 0)
380 auto stream = new MemStream(const_cast<char*>(doc->d->m_pdfData.constData()), 0, doc->d->m_pdfData.size(), Object());
381#else
382 Object obj;
383 obj.initNull();
384 auto stream = new MemStream(const_cast<char*>(doc->d->m_pdfData.constData()), 0, doc->d->m_pdfData.size(), &obj);
385#endif
386 std::unique_ptr<PDFDoc> popplerDoc(new PDFDoc(stream));
387 if (!popplerDoc->isOk()) {
388 qCWarning(Log) << "Got invalid PDF document!" << popplerDoc->getErrorCode();
389 return nullptr;
390 }
391
392 doc->d->m_pages.reserve(popplerDoc->getNumPages());
393 for (int i = 0; i < popplerDoc->getNumPages(); ++i) {
395 page.d->m_pageNum = i;
396 page.d->m_doc = doc->d.get();
397 doc->d->m_pages.push_back(page);
398 }
399
400 doc->d->m_popplerDoc = std::move(popplerDoc);
401 return doc.release();
402}
403
405{
406 return data.startsWith("%PDF");
407}
408
409#include "moc_pdfdocument.cpp"
PDF document for extraction.
Definition pdfdocument.h:92
PdfPage page(int index) const
The n-th page in this document.
int fileSize() const
File size of the entire document in bytes.
static PdfDocument * fromData(const QByteArray &data, QObject *parent=nullptr)
Creates a PdfDocument from the given raw data.
static bool maybePdf(const QByteArray &data)
Fast check whether data might be a PDF document.
An image in a PDF document.
Definition pdfimage.h:73
A page in a PDF document.
Definition pdfdocument.h:29
Q_INVOKABLE QVariantList linksInRect(double left, double top, double right, double bottom) const
Returns all links in the specified sub-rect of this page.
PdfImage image(int index) const
The n-th image found in this document.
int linkCount() const
The number of links found in this document.
Q_INVOKABLE QString textInRect(double left, double top, double right, double bottom) const
Returns the text in the specified sub-rect of this page.
PdfLink link(int index) const
The n-th link found in this document.
int imageCount() const
The number of images found in this document.
Q_INVOKABLE QVariantList imagesInRect(double left, double top, double right, double bottom) const
Returns the images in the specified sub-rect of this page.
KIOCORE_EXPORT CopyJob * link(const QList< QUrl > &src, const QUrl &destDir, JobFlags flags=DefaultFlags)
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
bool startsWith(const QByteArray &ba) const const
QObject * parent() const const
bool intersects(const QRectF &rectangle) const const
QString fromUtf16(const ushort *unicode, int size)
QString fromUtf8(const char *str, int size)
QVariant fromValue(const T &value)
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Sun Feb 25 2024 18:40:32 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.