KItinerary

pdfdocumentprocessor.cpp
/*
    SPDX-FileCopyrightText: 2018-2021 Volker Krause <vkrause@kde.org>

    SPDX-License-Identifier: LGPL-2.0-or-later
*/

#include "pdfdocumentprocessor.h"

#include "barcodedocumentprocessorhelper.h"
#include "genericpriceextractorhelper_p.h"

#include "pdf/pdfbarcodeutil_p.h"
#include "text/nameoptimizer_p.h"

#include <KItinerary/BarcodeDecoder>
#include <KItinerary/ExtractorDocumentNodeFactory>
#include <KItinerary/ExtractorEngine>
#include <KItinerary/ExtractorResult>
#include <KItinerary/PdfDocument>

#include <QDateTime>
#include <QDebug>
#include <QImage>
#include <QJSEngine>
#include <QString>


25using namespace KItinerary;
26
27Q_DECLARE_METATYPE(KItinerary::Internal::OwnedPtr<KItinerary::PdfDocument>)
28
// Upper bounds for the documents this processor is willing to look at.
// Both node-creation paths below reject anything exceeding either limit.
enum {
    MaxPageCount = 10, // maximum in the current test set is 6
    MaxFileSize = 10000000, // maximum in the current test set is ~9MB
};

34PdfDocumentProcessor::PdfDocumentProcessor() = default;
35PdfDocumentProcessor::~PdfDocumentProcessor() = default;
36
37bool PdfDocumentProcessor::canHandleData(const QByteArray &encodedData, QStringView fileName) const
38{
39 return PdfDocument::maybePdf(encodedData) ||
41}
42
43static void applyContextDateTime(PdfDocument *pdf, ExtractorDocumentNode &node)
44{
45 // ignore broken PDF times for Amadeus documents
46 if (pdf->producer() == QLatin1StringView("Amadeus") &&
47 pdf->creationTime() == pdf->modificationTime() &&
48 pdf->creationTime().date().year() <= 2013) {
49 return;
50 }
51
52 auto dt = pdf->modificationTime();
53 if (!dt.isValid()) {
54 dt = pdf->creationTime();
55 }
56 if (dt.isValid() && dt.date().year() > 2000 && dt < QDateTime::currentDateTime()) {
57 node.setContextDateTime(dt);
58 }
59}
60
62{
63 auto pdf = PdfDocument::fromData(encodedData);
64 // stay away from documents that are atypically large for what we are looking for
65 // that's just unnecessarily eating up resources
66 if (!pdf || pdf->pageCount() > MaxPageCount || pdf->fileSize() > MaxFileSize) {
67 delete pdf;
68 return {};
69 }
70
72 node.setContent<Internal::OwnedPtr<PdfDocument>>(pdf);
73 applyContextDateTime(pdf, node);
74 return node;
75}
76
78{
79 auto pdf = decodedData.value<PdfDocument*>();
80 // stay away from documents that are atypically large for what we are looking for
81 // that's just unnecessarily eating up resources
82 if (!pdf || pdf->pageCount() > MaxPageCount || pdf->fileSize() > MaxFileSize) {
83 return {};
84 }
85
87 node.setContent(pdf);
88 applyContextDateTime(pdf, node);
89 return node;
90}
91
93{
94 const auto doc = node.content<PdfDocument*>();
95
96 for (int i = 0; i < doc->pageCount(); ++i) {
97 const auto page = doc->page(i);
98 m_imageIds.clear();
99
100 for (int j = 0; j < page.imageCount(); ++j) {
101 auto img = page.image(j);
102 img.setLoadingHints(PdfImage::AbortOnColorHint | PdfImage::ConvertToGrayscaleHint); // we only care about b/w-ish images for barcode detection
103 if (img.hasObjectId() && m_imageIds.find(img.objectId()) != m_imageIds.end()) {
104 continue;
105 }
106
107 const auto barcodeHints = PdfBarcodeUtil::maybeBarcode(img, BarcodeDecoder::Any2D | BarcodeDecoder::Any1D);
108 if (barcodeHints == BarcodeDecoder::None) {
109 continue;
110 }
111
112 const auto imgData = img.image();
113 if (imgData.isNull()) { // can happen due to AbortOnColorHint
114 continue;
115 }
116
117 auto childNode = engine->documentNodeFactory()->createNode(imgData, u"internal/qimage");
118 childNode.setLocation(i);
119 node.appendChild(childNode); // TODO the old code de-duplicated repeated barcodes here - do we actually need that?
120 if (img.hasObjectId()) {
121 m_imageIds.insert(img.objectId());
122 }
123
124 // technically not our job to do this here rather than letting the image node processor handle this
125 // but we have the output aspect ratio of the barcode only here, which gives better decoding hints
126 if (BarcodeDocumentProcessorHelper::expandNode(imgData, barcodeHints, childNode, engine)) {
127 continue;
128 }
129
130 // if this failed, check if the image as a aspect-ratio distorting scale and try again with that
131 if (img.hasAspectRatioTransform()) {
132 BarcodeDocumentProcessorHelper::expandNode(img.applyAspectRatioTransform(imgData), barcodeHints, childNode, engine);
133 }
134 }
135
136 // handle full page raster images
137 if ((engine->hints() & ExtractorEngine::ExtractFullPageRasterImages) && page.imageCount() == 1 && page.text().isEmpty()) {
138 qDebug() << "full page raster image";
139 auto img = page.image(0);
140 if (img.hasObjectId() && m_imageIds.find(img.objectId()) != m_imageIds.end()) { // already handled
141 continue;
142 }
143
144 img.setLoadingHints(PdfImage::NoHint); // don't abort on color
145 const auto imgData = img.image();
146 if (imgData.isNull()) {
147 continue;
148 }
149
150 auto childNode = engine->documentNodeFactory()->createNode(imgData, u"internal/qimage");
151 childNode.setLocation(i);
152 node.appendChild(childNode);
153 if (img.hasObjectId()) {
154 m_imageIds.insert(img.objectId());
155 }
156 }
157 }
158
159 // fallback node for implicit conversion to plain text
160 auto fallback = engine->documentNodeFactory()->createNode(doc->text(), u"text/plain");
161 node.appendChild(fallback);
162}
163
164void PdfDocumentProcessor::postExtract(ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const
165{
166 // find the text node we can run the optimizer on
167 if (node.childNodes().empty() || node.result().isEmpty()) {
168 return;
169 }
170 const QString text = node.childNodes().back().content<QString>();
171
172 // run name optimizer on all results
173 QList<QVariant> result;
174 const auto res = node.result().result();
175 result.reserve(res.size());
176 for (const auto &r : res) {
177 result.push_back(NameOptimizer::optimizeNameRecursive(text, r));
178 }
179 node.setResult(std::move(result));
180
181 // look for price data, if we have chance of that being unambiguous
182 const auto doc = node.content<PdfDocument*>();
183 if (node.result().size() == 1 || doc->pageCount() == 1) {
184 GenericPriceExtractorHelper::postExtract(text, node);
185 }
186}
187
192
194{
195 destroyIfOwned<PdfDocument>(node);
196}
ExtractorDocumentNode createNode(const QByteArray &data, QStringView fileName={}, QStringView mimeType={}) const
Create a new document node from data.
A node in the extracted document object tree.
QJsonArray result
Result access for QJSEngine.
void setResult(ExtractorResult &&result)
Replace the existing results by result.
void appendChild(ExtractorDocumentNode &child)
Add another child node.
void setContextDateTime(const QDateTime &contextDateTime)
Set the context date/time.
QJSValue content
The decoded content of this node.
QVariantList childNodes
Child nodes, for QJSEngine access.
void setContent(const QVariant &content)
Set decoded content.
void setLocation(const QVariant &location)
Set the location information.
Semantic data extraction engine.
@ ExtractFullPageRasterImages
perform expensive image processing on (PDF) documents containing full page raster images
Hints hints() const
The currently set extraction hints.
const ExtractorDocumentNodeFactory * documentNodeFactory() const
Factory for creating new document nodes.
void postExtract(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override
Called after extractors have been applied to node.
bool canHandleData(const QByteArray &encodedData, QStringView fileName) const override
Fast check whether the given encoded data can possibly be processed by this instance.
QJSValue contentToScriptValue(const ExtractorDocumentNode &node, QJSEngine *engine) const override
Create a QJSValue for the node content.
void destroyNode(ExtractorDocumentNode &node) const override
Destroys type-specific data in node.
void expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override
Create child nodes for node, as far as that's necessary for this document type.
ExtractorDocumentNode createNodeFromData(const QByteArray &encodedData) const override
Create a document node from raw data.
ExtractorDocumentNode createNodeFromContent(const QVariant &decodedData) const override
Create a document node from an already decoded data type.
PDF document for extraction.
Definition pdfdocument.h:92
PdfPage page(int index) const
The n-th page in this document.
int fileSize() const
File size of the entire document in bytes.
static PdfDocument * fromData(const QByteArray &data, QObject *parent=nullptr)
Creates a PdfDocument from the given raw data.
static bool maybePdf(const QByteArray &data)
Fast check whether data might be a PDF document.
@ NoHint
Load image data as-is. The default.
Definition pdfimage.h:100
@ AbortOnColorHint
Abort loading when encountering a non black/white pixel, as a shortcut for barcode detection.
Definition pdfimage.h:101
@ ConvertToGrayscaleHint
Convert to QImage::Format_Grayscale8 during loading. More efficient than converting later if all you ...
Definition pdfimage.h:102
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
int year() const const
QDateTime currentDateTime()
QDate date() const const
QJSValue toScriptValue(const T &value)
bool isEmpty() const const
void push_back(const QJsonValue &value)
qsizetype size() const const
bool endsWith(QChar ch) const const
CaseInsensitive
T value() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Tue Mar 26 2024 11:14:49 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.