30 #include <QtXml/QDomDocument>
31 #include <QtXml/QXmlStreamReader>
32 #include <Soprano/Vocabulary/NAO>
34 using namespace Soprano::Vocabulary;
35 using namespace Nepomuk2::Vocabulary;
36 using namespace Nepomuk2;
47 list << QLatin1String(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document")
48 << QLatin1String(
"application/vnd.openxmlformats-officedocument.presentationml.presentation")
49 << QLatin1String(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
58 KZip zip(fileUrl.toLocalFile());
59 if (!zip.open(QIODevice::ReadOnly)) {
60 qWarning() <<
"Document is not a valid ZIP archive";
64 const KArchiveDirectory *rootDir = zip.directory();
66 qWarning() <<
"Invalid document structure (main directory is missing)";
70 const QStringList rootEntries = rootDir->entries();
71 if (!rootEntries.contains(
"docProps")) {
72 qWarning() <<
"Invalid document structure (docProps is missing)";
76 const KArchiveEntry* docPropEntry = rootDir->entry(
"docProps");
77 if( !docPropEntry->isDirectory() ) {
78 qWarning() <<
"Invalid document structure (docProps is not a directory)";
85 const KArchiveDirectory* docPropDirectory =
dynamic_cast<const KArchiveDirectory*
>( docPropEntry );
86 const QStringList docPropsEntries = docPropDirectory->entries();
88 if( docPropsEntries.contains(
"core.xml") ) {
89 QDomDocument coreDoc(
"core");
90 const KArchiveFile *file =
static_cast<const KArchiveFile*
>(docPropDirectory->entry(
"core.xml"));
91 coreDoc.setContent(file->data());
93 QDomElement docElem = coreDoc.documentElement();
95 QDomElement elem = docElem.firstChildElement(
"dc:description");
96 if( !elem.isNull() ) {
97 QString str = elem.text();
98 if( !str.isEmpty() ) {
103 elem = docElem.firstChildElement(
"dc:subject");
104 if( !elem.isNull() ) {
105 QString str = elem.text();
106 if( !str.isEmpty() ) {
111 elem = docElem.firstChildElement(
"dc:title");
112 if( !elem.isNull() ) {
113 QString str = elem.text();
114 if( !str.isEmpty() ) {
119 elem = docElem.firstChildElement(
"dc:creator");
120 if( !elem.isNull() ) {
121 QString str = elem.text();
122 if( !str.isEmpty() ) {
124 creator.
addType( NCO::Contact() );
// The OOXML core-properties part (docProps/core.xml) stores the document language in
// the Dublin Core element <dc:language>. The previous literal "dc:langauge" was a
// misspelling that could never match, so the language was silently never extracted.
132 elem = docElem.firstChildElement(
"dc:language");
133 if( !elem.isNull() ) {
134 QString str = elem.text();
135 if( !str.isEmpty() ) {
141 if( docPropsEntries.contains(
"app.xml") ) {
142 QDomDocument appDoc(
"app");
143 const KArchiveFile *file =
static_cast<const KArchiveFile*
>(docPropDirectory->entry(
"app.xml"));
144 appDoc.setContent(file->data());
146 QDomElement docElem = appDoc.documentElement();
149 if( mimeType == QLatin1String(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document") ) {
150 QDomElement elem = docElem.firstChildElement(
"Pages");
151 if( !elem.isNull() ) {
153 int pageCount = elem.text().toInt(&ok);
155 fileRes.
setProperty( NFO::pageCount(), pageCount );
159 elem = docElem.firstChildElement(
"Words");
160 if( !elem.isNull() ) {
162 int wordCount = elem.text().toInt(&ok);
164 fileRes.
setProperty( NFO::wordCount(), wordCount );
169 QDomElement elem = docElem.firstChildElement(
"Application");
170 if( !elem.isNull() ) {
171 QString app = elem.text();
172 if( !app.isEmpty() ) {
179 if (rootEntries.contains(
"word")) {
180 const KArchiveEntry* wordEntry = rootDir->entry(
"word");
181 if( !wordEntry->isDirectory() ) {
182 qWarning() <<
"Invalid document structure (word is not a directory)";
186 const KArchiveDirectory* wordDirectory =
dynamic_cast<const KArchiveDirectory*
>( wordEntry );
187 const QStringList wordEntries = wordDirectory->entries();
189 if( wordEntries.contains(
"document.xml") ) {
190 QDomDocument appDoc(
"document");
191 const KArchiveFile *file =
static_cast<const KArchiveFile*
>(wordDirectory->entry(
"document.xml"));
194 QTextStream stream(&plainText);
196 extractTextWithTag(file->createDevice(), QLatin1String(
"w:t"), stream);
197 if( !plainText.isEmpty() )
198 fileRes.
addProperty( NIE::plainTextContent(), plainText );
202 else if( rootEntries.contains(
"xl") ) {
203 const KArchiveEntry* xlEntry = rootDir->entry(
"xl");
204 if( !xlEntry->isDirectory() ) {
205 qWarning() <<
"Invalid document structure (xl is not a directory)";
210 QTextStream stream(&plainText);
212 const KArchiveDirectory* xlDirectory =
dynamic_cast<const KArchiveDirectory*
>( xlEntry );
213 extractTextFromFiles( xlDirectory, stream );
214 if( !plainText.isEmpty() )
215 fileRes.
addProperty( NIE::plainTextContent(), plainText );
218 else if( rootEntries.contains(
"ppt") ) {
219 const KArchiveEntry* pptEntry = rootDir->entry(
"ppt");
220 if( !pptEntry->isDirectory() ) {
221 qWarning() <<
"Invalid document structure (ppt is not a directory)";
226 QTextStream stream(&plainText);
228 const KArchiveDirectory* pptDirectory =
dynamic_cast<const KArchiveDirectory*
>( pptEntry );
229 extractTextFromFiles( pptDirectory, stream );
230 if( !plainText.isEmpty() )
231 fileRes.
addProperty( NIE::plainTextContent(), plainText );
// Scans an XML part token by token and feeds its character data towards `stream`.
//   device - XML input handed to a QXmlStreamReader.
//   stream - text sink; the visible code appends a single blank to it after a text
//            run that does not already end in whitespace.
// NOTE(review): several original lines of this function are not visible here (the
// token advance, the write of the text itself, any size limit, and the disposal of
// `device`) -- confirm against the full file before relying on this summary.
241 void Office2007Extractor::extractAllText(QIODevice* device, QTextStream& stream)
243 QXmlStreamReader xml( device );
// Keep going until the reader reports that the input is exhausted.
245 while( !xml.atEnd() ) {
// Only character-data tokens contribute text.
250 if( xml.isCharacters() ) {
251 QString str = xml.text().toString();
// Separate consecutive text runs with a blank unless one is already there.
// NOTE(review): str.at(str.length()-1) requires a non-empty string; presumably an
// emptiness check sits in the lines not shown -- verify.
254 if( !str.at(str.length()-1).isSpace() )
255 stream << QLatin1Char(
' ');
// End of document or a parse error is tested explicitly in addition to atEnd().
258 if( xml.isEndDocument() || xml.hasError() )
// Walks an archive directory recursively and runs extractAllText() on its file entries.
//   archiveDir - directory inside the opened archive whose entries are visited.
//   stream     - text sink passed through to extractAllText() and to recursive calls.
// NOTE(review): some original lines are not visible here (what happens right after the
// recursion and after the ".xml" suffix test) -- presumably both skip to the next
// entry; confirm against the full file.
263 void Office2007Extractor::extractTextFromFiles(
const KArchiveDirectory* archiveDir, QTextStream& stream)
265 const QStringList entries = archiveDir->entries();
266 foreach(
const QString& entryName, entries) {
267 const KArchiveEntry* entry = archiveDir->entry(entryName);
// Sub-directories are descended into with the same sink.
268 if( entry->isDirectory() ) {
269 const KArchiveDirectory* subDir =
dynamic_cast<const KArchiveDirectory*
>(entry);
270 extractTextFromFiles( subDir, stream );
// Entries are filtered on the ".xml" name suffix.
277 if( !entryName.endsWith(
".xml") )
// The entry is treated as a file and a device for its content is handed on.
// NOTE(review): KArchiveFile::createDevice() returns a device the caller must
// delete; no deletion is visible in this excerpt -- verify ownership is handled.
280 const KArchiveFile* file =
static_cast<const KArchiveFile*
>(entry);
281 extractAllText( file->createDevice(), stream );
// Scans an XML part and collects the text of elements selected by `tag`.
//   device - XML input handed to a QXmlStreamReader.
//   tag    - prefix compared against each start element's qualified name.
//   stream - text sink; the visible code appends a blank to it after a non-empty run.
// NOTE(review): several original lines are not visible here (the token advance, the
// write of the text itself, any size limit, and the disposal of `device`) -- confirm
// against the full file.
285 void Office2007Extractor::extractTextWithTag(QIODevice* device,
const QString& tag, QTextStream& stream)
287 QXmlStreamReader xml( device );
290 while( !xml.atEnd() ) {
// startsWith() is a prefix test, so any start element whose qualified name merely
// begins with `tag` is selected, not only an exact match.
295 if( xml.qualifiedName().startsWith(tag) && xml.isStartElement() ) {
// Text of the element and all of its descendants, with whitespace collapsed.
296 QString str = xml.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
298 if( !str.isEmpty() ) {
// simplified() already removed trailing whitespace, so for a non-empty string
// this condition always holds and the blank is always appended.
302 if( !str.at(str.length()-1).isSpace() )
303 stream << QLatin1Char(
' ');
// End of document or a parse error is tested explicitly in addition to atEnd().
307 if( xml.isEndDocument() || xml.hasError() )
void setProperty(const QUrl &property, const QVariant &value)
Set a property overwriting existing values.
Represents a snapshot of one Nepomuk resource.
PropertyHash properties() const
void addProperty(const QUrl &property, const QVariant &value)
Add a property.
void addType(const QUrl &type)
A convenience method which adds a property of type rdf:type.