7#include "config-kitinerary.h"
8#include "pdfdocument.h"
9#include "pdfdocument_p.h"
10#include "pdfextractoroutputdevice_p.h"
11#include "pdfimage_p.h"
12#include "popplerglobalparams_p.h"
13#include "popplerutils_p.h"
18#include <QScopedValueRollback>
23#include <PDFDocEncoding.h>
// Renders this page through a PdfExtractorOutputDevice and takes over the
// images and links the device collected while rendering.
// NOTE(review): this listing omits several original lines (braces and the
// statement that consumes `s`); the comments below only describe what is visible.
31void PdfPagePrivate::load()
37 PopplerGlobalParams gp;
38 PdfExtractorOutputDevice device;
// m_pageNum is passed as m_pageNum + 1 to every Poppler call in this file.
// NOTE(review): the two 72 arguments are presumably the horizontal/vertical DPI
// of Poppler's displayPageSlice() - confirm against the Poppler headers.
39 m_doc->m_popplerDoc->displayPageSlice(&device, m_pageNum + 1, 72, 72, 0,
false,
true,
false, -1, -1, -1, -1);
// Feed the page's links to the same device.
40 m_doc->m_popplerDoc->processLinks(&device, m_pageNum + 1);
// Ask the device for the text covering the whole crop box.
42 const auto pageRect = m_doc->m_popplerDoc->getPage(m_pageNum + 1)->getCropBox();
43 std::unique_ptr<GooString> s(device.getText(pageRect->x1, pageRect->y1, pageRect->x2, pageRect->y2));
// Move the collected images out of the device and point each one back at this page.
46 m_images = std::move(device.m_images);
47 for (
auto it = m_images.begin(); it != m_images.end(); ++it) {
48 (*it).d->m_page =
this;
// Move the collected links out of the device and hand each one the crop box.
// NOTE(review): convertToPageRect() presumably makes the link area relative to
// the crop box - confirm in PdfLink.
51 m_links = std::move(device.m_links);
52 for (
auto &link : m_links) {
53 link.convertToPageRect(pageRect);
60 : d(new PdfPagePrivate)
64PdfPage::PdfPage(
const PdfPage&) =
default;
65PdfPage::~PdfPage() =
default;
/**
 * Linearly interpolates between @p begin and @p end.
 *
 * @param begin value produced for a @p ratio of 0.
 * @param end value produced for a @p ratio of 1 (up to floating-point rounding).
 * @param ratio relative position between the two bounds; it is not clamped,
 *              so values outside [0, 1] extrapolate.
 * @return begin + (end - begin) * ratio
 */
static double ratio(double begin, double end, double ratio)
{
    return begin + (end - begin) * ratio;
}
81 PopplerGlobalParams gp;
83 const auto page = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1);
84 const auto pageRect = page->getCropBox();
90 switch (page->getRotate()) {
92 l = ratio(pageRect->x1, pageRect->x2, left);
93 t = ratio(pageRect->y1, pageRect->y2, top);
94 r = ratio(pageRect->x1, pageRect->x2, right);
95 b = ratio(pageRect->y1, pageRect->y2, bottom);
98 l = ratio(pageRect->y1, pageRect->y2, left);
99 t = ratio(pageRect->x1, pageRect->x2, top);
100 r = ratio(pageRect->y1, pageRect->y2, right);
101 b = ratio(pageRect->x1, pageRect->x2, bottom);
104 qCWarning(
Log) <<
"Unsupported page rotation!" << page->getRotate();
108 TextOutputDev device(
nullptr,
false, 0,
false,
false);
109 d->m_doc->m_popplerDoc->displayPageSlice(&device, d->m_pageNum + 1, 72, 72, 0,
false,
true,
false, -1, -1, -1, -1);
110 std::unique_ptr<GooString> s(device.getText(l, t, r, b));
117 return d->m_images.size();
123 return d->m_images[index];
126QVariantList PdfPage::imagesVariant()
const
131 std::for_each(d->m_images.begin(), d->m_images.end(), [&l](
const PdfImage& img) { l.push_back(QVariant::fromValue(img)); });
139 PopplerGlobalParams gp;
140 const auto pageRect = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1)->getCropBox();
142 for (
const auto &img : d->m_images) {
143 if ((img.d->m_transform.dx() >= ratio(pageRect->x1, pageRect->x2, left) && img.d->m_transform.dx() <= ratio(pageRect->x1, pageRect->x2, right)) &&
144 (img.d->m_transform.dy() >= ratio(pageRect->y1, pageRect->y2, top) && img.d->m_transform.dy() <= ratio(pageRect->y1, pageRect->y2, bottom)))
155 return d->m_links.size();
161 return d->m_links[index];
164QVariantList PdfPage::linksVariant()
const
168 l.reserve(d->m_links.size());
169 std::transform(d->m_links.begin(), d->m_links.end(), std::back_inserter(l), [](
const PdfLink &
link) { return QVariant::fromValue(link); });
179 for (
const auto &
link : d->m_links) {
186 std::sort(l.begin(), l.end(), [](
const auto &lhs,
const auto &rhs) {
187 const auto lhsLink = lhs.template value<PdfLink>();
188 const auto rhsLink = rhs.template value<PdfLink>();
189 if (lhsLink.area().top() == rhsLink.area().top()) {
190 return lhsLink.area().left() < rhsLink.area().left();
192 return lhsLink.area().top() < rhsLink.area().top();
/**
 * Converts a length from PDF points to millimetres.
 *
 * Uses 72 points per inch and 25.4 mm per inch.
 *
 * @param points length in points.
 * @return the same length in millimetres, unrounded.
 */
static constexpr inline double pdfToMM(double points)
{
    return points * 25.4 / 72.0;
}
203int PdfPage::width()
const
205 const auto page = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1);
206 const auto rot = page->getRotate();
207 if (rot == 90 || rot == 270) {
208 return pdfToMM(page->getCropHeight());
210 return pdfToMM(page->getCropWidth());
213int PdfPage::height()
const
215 const auto page = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1);
216 const auto rot = page->getRotate();
217 if (rot == 90 || rot == 270) {
218 return pdfToMM(page->getCropWidth());
220 return pdfToMM(page->getCropHeight());
224PdfDocument::PdfDocument(
QObject *parent)
226 , d(new PdfDocumentPrivate)
230PdfDocument::~PdfDocument() =
default;
235 std::for_each(d->m_pages.begin(), d->m_pages.end(), [&text](
const PdfPage &p) { text += p.text(); });
239int PdfDocument::pageCount()
const
241 return d->m_popplerDoc->getNumPages();
246 return d->m_pages[index];
251 return d->m_pdfData.size();
// Converts a date string held in a GooString into a QDateTime.
// NOTE(review): this listing omits the local declarations, the return
// statements and most of the timezone handling; only the visible steps are
// described here.
254static QDateTime parsePdfDateTime(
const GooString *str)
// Split the string into its date, time and timezone components.
266 if (!parseDateString(str, &year, &month, &day, &hour, &min, &sec, &tz, &tzHours, &tzMins)) {
// Check that the parsed components form a valid date and a valid time.
270 QDate date(year, month, day);
271 QTime time(hour, min, sec);
272 if (!date.isValid() || !time.isValid()) {
// Magnitude of the UTC offset in seconds; the sign depends on tz, which is
// tested against '-' below.
276 int offset = tzHours * 3600 + tzMins * 60;
279 }
else if (tz ==
'-') {
287 std::unique_ptr<GooString> dt(d->m_popplerDoc->getDocInfoCreatDate());
291 return parsePdfDateTime(dt.get());
296 std::unique_ptr<GooString> dt(d->m_popplerDoc->getDocInfoModDate());
300 return parsePdfDateTime(dt.get());
// Converts a Poppler GooString into a QString.
// When a Unicode byte order mark is detected, the raw bytes are read as UTF-16;
// the other visible path converts via pdfDocEncodingToUTF16() first.
// NOTE(review): this listing omits the #else/#endif lines, the declaration of
// `len` and any null handling of `s` - confirm against the full source.
304QString gooStringToUnicode(
const std::unique_ptr<GooString> &s)
// The version check selects free-function byte-order-mark tests on s->toStr();
// the member-function form on line 313 is presumably the #else branch.
310#if KPOPPLER_VERSION >= QT_VERSION_CHECK(24, 5, 0)
311 if (hasUnicodeByteOrderMark(s->toStr()) || hasUnicodeByteOrderMarkLE(s->toStr())) {
313 if (s->hasUnicodeMarker() || s->hasUnicodeMarkerLE()) {
// size() counts bytes, so halve it to get the number of char16_t units.
315 return QString::fromUtf16(
reinterpret_cast<const char16_t*
>(s->toStr().c_str()), s->toStr().size() / 2);
// Convert to UTF-16 via pdfDocEncodingToUTF16(); `len` is halved in the same
// way before the buffer is handed to QString.
318 std::unique_ptr<const char[]> utf16Data(pdfDocEncodingToUTF16(s->toStr(), &len));
319 return QString::fromUtf16(
reinterpret_cast<const char16_t*
>(utf16Data.get()), len / 2);
327 return gooStringToUnicode(d->m_popplerDoc->getDocInfoTitle());
332 return gooStringToUnicode(d->m_popplerDoc->getDocInfoProducer());
337 return gooStringToUnicode(d->m_popplerDoc->getDocInfoCreator());
342 return gooStringToUnicode(d->m_popplerDoc->getDocInfoAuthor());
345QVariantList PdfDocument::pagesVariant()
const
348 l.reserve(pageCount());
349 std::for_each(d->m_pages.begin(), d->m_pages.end(), [&l](
const PdfPage& p) { l.push_back(QVariant::fromValue(p)); });
355 PopplerGlobalParams gp;
358 doc->d->m_pdfData = data;
360 auto stream =
new MemStream(
const_cast<char*
>(doc->d->m_pdfData.constData()), 0, doc->d->m_pdfData.size(), Object());
361 std::unique_ptr<PDFDoc> popplerDoc(
new PDFDoc(stream));
362 if (!popplerDoc->isOk()) {
363 qCWarning(
Log) <<
"Got invalid PDF document!" << popplerDoc->getErrorCode();
367 doc->d->m_pages.reserve(popplerDoc->getNumPages());
368 for (
int i = 0; i < popplerDoc->getNumPages(); ++i) {
370 page.d->m_pageNum = i;
371 page.d->m_doc = doc->d.get();
372 doc->d->m_pages.push_back(
page);
375 doc->d->m_popplerDoc = std::move(popplerDoc);
376 return doc.release();
384#include "moc_pdfdocument.cpp"
PDF document for extraction.
PdfPage page(int index) const
The n-th page in this document.
int fileSize() const
File size of the entire document in bytes.
static PdfDocument * fromData(const QByteArray &data, QObject *parent=nullptr)
Creates a PdfDocument from the given raw data.
static bool maybePdf(const QByteArray &data)
Fast check whether data might be a PDF document.
An image in a PDF document.
An external link in a PDF file.
A page in a PDF document.
Q_INVOKABLE QVariantList linksInRect(double left, double top, double right, double bottom) const
Returns all links in the specified sub-rect of this page.
PdfImage image(int index) const
The n-th image found in this document.
int linkCount() const
The number of links found in this document.
Q_INVOKABLE QString textInRect(double left, double top, double right, double bottom) const
Returns the text in the specified sub-rect of this page.
PdfLink link(int index) const
The n-th link found in this document.
int imageCount() const
The number of images found in this document.
Q_INVOKABLE QVariantList imagesInRect(double left, double top, double right, double bottom) const
Returns the images in the specified sub-rect of this page.
KIOCORE_EXPORT CopyJob * link(const QList< QUrl > &src, const QUrl &destDir, JobFlags flags=DefaultFlags)
Classes for reservation/travel data models, data extraction and data augmentation.
bool startsWith(QByteArrayView bv) const const
QObject * parent() const const
bool intersects(const QRectF &rectangle) const const
QString fromUtf16(const char16_t *unicode, qsizetype size)
QString fromUtf8(QByteArrayView str)
QTimeZone fromSecondsAheadOfUtc(int offset)
QVariant fromValue(T &&value)