KItinerary

pdfdocumentprocessor.cpp
1/*
2 SPDX-FileCopyrightText: 2018-2021 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "pdfdocumentprocessor.h"
8
9#include "barcodedocumentprocessorhelper.h"
10#include "genericpriceextractorhelper_p.h"
11
12#include "pdf/pdfbarcodeutil_p.h"
13#include "text/nameoptimizer_p.h"
14
15#include <KItinerary/BarcodeDecoder>
16#include <KItinerary/ExtractorDocumentNodeFactory>
17#include <KItinerary/ExtractorEngine>
18#include <KItinerary/ExtractorResult>
19#include <KItinerary/PdfDocument>
20
21#include <QImage>
22#include <QJSEngine>
23
24
25using namespace KItinerary;
26
27Q_DECLARE_METATYPE(KItinerary::Internal::OwnedPtr<KItinerary::PdfDocument>)
28
// Upper bounds for the documents this processor accepts; the node creation
// functions reject any PDF exceeding either of them.
enum {
    MaxPageCount = 10, // maximum in the current test set is 6
    MaxFileSize = 10000000, // in bytes; maximum in the current test set is ~9MB
};
33
34PdfDocumentProcessor::PdfDocumentProcessor() = default;
35PdfDocumentProcessor::~PdfDocumentProcessor() = default;
36
37bool PdfDocumentProcessor::canHandleData(const QByteArray &encodedData, QStringView fileName) const
38{
39 return PdfDocument::maybePdf(encodedData) ||
41}
42
43static void applyContextDateTime(PdfDocument *pdf, ExtractorDocumentNode &node)
44{
45 // ignore broken PDF times for Amadeus documents
46 if (pdf->producer() == QLatin1StringView("Amadeus") &&
47 pdf->creationTime() == pdf->modificationTime() &&
48 pdf->creationTime().date().year() <= 2013) {
49 return;
50 }
51
52 // ignore broken PDF times for this specific PDFsharp version (used by Eurowings)
53 if (pdf->producer() == QLatin1StringView("PDFsharp 1.32.2602-g (www.pdfsharp.net)")
54 && pdf->creationTime().date() == QDate(2019, 05, 02)) {
55 return;
56 }
57
58 auto dt = pdf->modificationTime();
59 if (!dt.isValid()) {
60 dt = pdf->creationTime();
61 }
62 if (dt.isValid() && dt.date().year() > 2000 && dt < QDateTime::currentDateTime()) {
63 node.setContextDateTime(dt);
64 }
65}
66
68{
69 auto pdf = PdfDocument::fromData(encodedData);
70 // stay away from documents that are atypically large for what we are looking for
71 // that's just unnecessarily eating up resources
72 if (!pdf || pdf->pageCount() > MaxPageCount || pdf->fileSize() > MaxFileSize) {
73 delete pdf;
74 return {};
75 }
76
78 node.setContent<Internal::OwnedPtr<PdfDocument>>(pdf);
79 applyContextDateTime(pdf, node);
80 return node;
81}
82
84{
85 auto pdf = decodedData.value<PdfDocument*>();
86 // stay away from documents that are atypically large for what we are looking for
87 // that's just unnecessarily eating up resources
88 if (!pdf || pdf->pageCount() > MaxPageCount || pdf->fileSize() > MaxFileSize) {
89 return {};
90 }
91
93 node.setContent(pdf);
94 applyContextDateTime(pdf, node);
95 return node;
96}
97
99{
100 const auto doc = node.content<PdfDocument*>();
101
102 for (int i = 0; i < doc->pageCount(); ++i) {
103 const auto page = doc->page(i);
104 m_imageIds.clear();
105
106 for (int j = 0; j < page.imageCount(); ++j) {
107 auto img = page.image(j);
108 img.setLoadingHints(PdfImage::AbortOnColorHint | PdfImage::ConvertToGrayscaleHint); // we only care about b/w-ish images for barcode detection
109 if (img.hasObjectId() && m_imageIds.find(img.objectId()) != m_imageIds.end()) {
110 continue;
111 }
112
113 const auto barcodeHints = PdfBarcodeUtil::maybeBarcode(img, BarcodeDecoder::Any2D | BarcodeDecoder::Any1D);
114 if (barcodeHints == BarcodeDecoder::None) {
115 continue;
116 }
117
118 const auto imgData = img.image();
119 if (imgData.isNull()) { // can happen due to AbortOnColorHint
120 continue;
121 }
122
123 auto childNode = engine->documentNodeFactory()->createNode(imgData, u"internal/qimage");
124 childNode.setLocation(i);
125 node.appendChild(childNode); // TODO the old code de-duplicated repeated barcodes here - do we actually need that?
126 if (img.hasObjectId()) {
127 m_imageIds.insert(img.objectId());
128 }
129
130 // technically not our job to do this here rather than letting the image node processor handle this
131 // but we have the output aspect ratio of the barcode only here, which gives better decoding hints
132 if (BarcodeDocumentProcessorHelper::expandNode(imgData, barcodeHints, childNode, engine)) {
133 continue;
134 }
135
136 // if this failed, check if the image as a aspect-ratio distorting scale and try again with that
137 if (img.hasAspectRatioTransform()) {
138 BarcodeDocumentProcessorHelper::expandNode(img.applyAspectRatioTransform(imgData), barcodeHints, childNode, engine);
139 }
140 }
141
142 // handle full page raster images (ignoring masks)
143 int imageCount = 0;
144 for (auto i = 0; i < page.imageCount(); ++i) {
145 if (page.image(i).type() == PdfImageType::Image) {
146 ++imageCount;
147 }
148 }
149 if ((engine->hints() & ExtractorEngine::ExtractFullPageRasterImages) && imageCount == 1 && page.text().isEmpty()) {
150 qDebug() << "full page raster image";
151 auto img = page.image(0);
152 if (img.hasObjectId() && m_imageIds.find(img.objectId()) != m_imageIds.end()) { // already handled
153 continue;
154 }
155
156 img.setLoadingHints(PdfImage::NoHint); // don't abort on color
157 const auto imgData = img.image();
158 if (imgData.isNull()) {
159 continue;
160 }
161
162 auto childNode = engine->documentNodeFactory()->createNode(imgData, u"internal/qimage");
163 childNode.setLocation(i);
164 node.appendChild(childNode);
165 if (img.hasObjectId()) {
166 m_imageIds.insert(img.objectId());
167 }
168 }
169 }
170
171 // fallback node for implicit conversion to plain text
172 auto fallback = engine->documentNodeFactory()->createNode(doc->text(), u"text/plain");
173 node.appendChild(fallback);
174}
175
176void PdfDocumentProcessor::postExtract(ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const
177{
178 // find the text node we can run the optimizer on
179 if (node.childNodes().empty() || node.result().isEmpty()) {
180 return;
181 }
182 const QString text = node.childNodes().back().content<QString>();
183
184 // run name optimizer on all results
185 QList<QVariant> result;
186 const auto res = node.result().result();
187 result.reserve(res.size());
188 for (const auto &r : res) {
189 result.push_back(NameOptimizer::optimizeNameRecursive(text, r));
190 }
191 node.setResult(std::move(result));
192
193 // look for price data, if we have chance of that being unambiguous
194 const auto doc = node.content<PdfDocument*>();
195 if (node.result().size() == 1 || doc->pageCount() == 1) {
196 GenericPriceExtractorHelper::postExtract(text, node);
197 }
198}
199
204
206{
207 destroyIfOwned<PdfDocument>(node);
208}
ExtractorDocumentNode createNode(const QByteArray &data, QStringView fileName={}, QStringView mimeType={}) const
Create a new document node from data.
A node in the extracted document object tree.
QJsonArray result
Result access for QJSEngine.
void setResult(ExtractorResult &&result)
Replace the existing results by result.
void appendChild(ExtractorDocumentNode &child)
Add another child node.
void setContextDateTime(const QDateTime &contextDateTime)
Set the context date/time.
QJSValue content
The decoded content of this node.
QVariantList childNodes
Child nodes, for QJSEngine access.
void setContent(const QVariant &content)
Set decoded content.
void setLocation(const QVariant &location)
Set the location information.
Semantic data extraction engine.
@ ExtractFullPageRasterImages
perform expensive image processing on (PDF) documents containing full page raster images
Hints hints() const
The currently set extraction hints.
const ExtractorDocumentNodeFactory * documentNodeFactory() const
Factory for creating new document nodes.
void postExtract(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override
Called after extractors have been applied to node.
bool canHandleData(const QByteArray &encodedData, QStringView fileName) const override
Fast check whether the given encoded data can possibly be processed by this instance.
QJSValue contentToScriptValue(const ExtractorDocumentNode &node, QJSEngine *engine) const override
Create a QJSValue for the node content.
void destroyNode(ExtractorDocumentNode &node) const override
Destroys type-specific data in node.
void expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override
Create child nodes for node, as far as that's necessary for this document type.
ExtractorDocumentNode createNodeFromData(const QByteArray &encodedData) const override
Create a document node from raw data.
ExtractorDocumentNode createNodeFromContent(const QVariant &decodedData) const override
Create a document node from an already decoded data type.
PDF document for extraction.
Definition pdfdocument.h:92
int fileSize() const
File size of the entire document in bytes.
static PdfDocument * fromData(const QByteArray &data, QObject *parent=nullptr)
Creates a PdfDocument from the given raw data.
static bool maybePdf(const QByteArray &data)
Fast check whether data might be a PDF document.
@ NoHint
Load image data as-is. The default.
Definition pdfimage.h:101
@ AbortOnColorHint
Abort loading when encountering a non black/white pixel, as a shortcut for barcode detection.
Definition pdfimage.h:102
@ ConvertToGrayscaleHint
Convert to QImage::Format_Grayscale8 during loading. More efficient than converting later if all you ...
Definition pdfimage.h:103
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
int year() const
QDateTime currentDateTime()
QDate date() const
QJSValue toScriptValue(const T &value)
bool isEmpty() const
qsizetype size() const
bool endsWith(QChar ch) const
CaseInsensitive
T value() const
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Mon Nov 4 2024 16:28:48 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.