KCoreAddons

ktexttohtml.cpp
1/*
2 SPDX-FileCopyrightText: 2002 Dave Corrie <kde@davecorrie.com>
3 SPDX-FileCopyrightText: 2014 Daniel Vrátil <dvratil@redhat.com>
4
5 SPDX-License-Identifier: LGPL-2.0-or-later
6*/
7
8#include "ktexttohtml.h"
9#include "kemoticonsparser_p.h"
10#include "ktexttohtml_p.h"
11
12#include <QCoreApplication>
13#include <QFile>
14#include <QRegularExpression>
15#include <QStringList>
16
17#include <limits.h>
18
19KTextToHTMLHelper::KTextToHTMLHelper(const QString &plainText, int pos, int maxUrlLen, int maxAddressLen)
20 : mText(plainText)
21 , mMaxUrlLen(maxUrlLen)
22 , mMaxAddressLen(maxAddressLen)
23 , mPos(pos)
24{
25}
26
27QString KTextToHTMLHelper::getEmailAddress()
28{
30
31 if (mPos < mText.length() && mText.at(mPos) == QLatin1Char('@')) {
32 // the following characters are allowed in a dot-atom (RFC 2822):
33 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
34 const QString allowedSpecialChars = QStringLiteral(".!#$%&'*+-/=?^_`{|}~");
35
36 // determine the local part of the email address
37 int start = mPos - 1;
38 while (start >= 0 && mText.at(start).unicode() < 128
39 && (mText.at(start).isLetterOrNumber() //
40 || mText.at(start) == QLatin1Char('@') // allow @ to find invalid email addresses
41 || allowedSpecialChars.indexOf(mText.at(start)) != -1)) {
42 if (mText.at(start) == QLatin1Char('@')) {
43 return QString(); // local part contains '@' -> no email address
44 }
45 --start;
46 }
47 ++start;
48 // we assume that an email address starts with a letter or a digit
49 while ((start < mPos) && !mText.at(start).isLetterOrNumber()) {
50 ++start;
51 }
52 if (start == mPos) {
53 return QString(); // local part is empty -> no email address
54 }
55
56 // determine the domain part of the email address
57 int dotPos = INT_MAX;
58 int end = mPos + 1;
59 while (end < mText.length()
60 && (mText.at(end).isLetterOrNumber() //
61 || mText.at(end) == QLatin1Char('@') // allow @ to find invalid email addresses
62 || mText.at(end) == QLatin1Char('.') //
63 || mText.at(end) == QLatin1Char('-'))) {
64 if (mText.at(end) == QLatin1Char('@')) {
65 return QString(); // domain part contains '@' -> no email address
66 }
67 if (mText.at(end) == QLatin1Char('.')) {
68 dotPos = qMin(dotPos, end); // remember index of first dot in domain
69 }
70 ++end;
71 }
72 // we assume that an email address ends with a letter or a digit
73 while ((end > mPos) && !mText.at(end - 1).isLetterOrNumber()) {
74 --end;
75 }
76 if (end == mPos) {
77 return QString(); // domain part is empty -> no email address
78 }
79 if (dotPos >= end) {
80 return QString(); // domain part doesn't contain a dot
81 }
82
83 if (end - start > mMaxAddressLen) {
84 return QString(); // too long -> most likely no email address
85 }
86 address = mText.mid(start, end - start);
87
88 mPos = end - 1;
89 }
90 return address;
91}
92
93QString KTextToHTMLHelper::getPhoneNumber()
94{
95 if (!mText.at(mPos).isDigit() && mText.at(mPos) != QLatin1Char('+')) {
96 return {};
97 }
98
99 const QString allowedBeginSeparators = QStringLiteral(" \r\t\n:");
100 if (mPos > 0 && !allowedBeginSeparators.contains(mText.at(mPos - 1))) {
101 return {};
102 }
103
104 // this isn't 100% accurate, we filter stuff below that is too hard to capture with a regexp
105 static const QRegularExpression telPattern(QStringLiteral(R"([+0](( |( ?[/-] ?)?)\‍(?\d+\)?+){6,30})"));
107 if (match.hasMatch()) {
108 QStringView matchedText = match.capturedView();
109 // check for maximum number of digits (15), see https://en.wikipedia.org/wiki/Telephone_numbering_plan
110 const int digitsCount = std::count_if(matchedText.cbegin(), matchedText.cend(), [](const QChar c) {
111 return c.isDigit();
112 });
113
114 if (digitsCount > 15) {
115 return {};
116 }
117
118 // only one / is allowed, otherwise we trigger on dates
119 if (matchedText.count(QLatin1Char('/')) > 1) {
120 return {};
121 }
122
123 // parenthesis need to be balanced, and must not be nested
124 int openIdx = -1;
125 for (int i = 0, size = matchedText.size(); i < size; ++i) {
126 const QChar ch = matchedText.at(i);
127 if ((ch == QLatin1Char('(') && openIdx >= 0) || (ch == QLatin1Char(')') && openIdx < 0)) {
128 return {};
129 }
130
131 if (ch == QLatin1Char('(')) {
132 openIdx = i;
133 } else if (ch == QLatin1Char(')')) {
134 openIdx = -1;
135 }
136 }
137
138 if (openIdx > 0) {
139 matchedText.truncate(openIdx - 1);
140 matchedText = matchedText.trimmed();
141 }
142
143 // check if there's a plausible separator at the end
144 const int matchedTextLength = matchedText.size();
145 const int endIdx = mPos + matchedTextLength;
146 if (endIdx < mText.size() && !QStringView(u" \r\t\n,.").contains(mText.at(endIdx))) {
147 return {};
148 }
149
150 mPos += matchedTextLength - 1;
151 return matchedText.toString();
152 }
153 return {};
154}
155
156static QString normalizePhoneNumber(const QString &str)
157{
158 QString res;
159 res.reserve(str.size());
160 for (const auto c : str) {
161 if (c.isDigit() || c == QLatin1Char('+')) {
162 res.push_back(c);
163 }
164 }
165 return res;
166}
167
// Non-alphanumeric characters that RFC 2822 permits inside a dot-atom.
// Together with a-z, A-Z and 0-9 these are:
// . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
static constexpr char s_allowedSpecialChars[] = ".!#$%&'*+-/=?^_`{|}~";
171
172bool KTextToHTMLHelper::atUrl() const
173{
174 // The character directly before the URL must not be a letter, a number or
175 // any other character allowed in a dot-atom (RFC 2822).
176 if (mPos > 0) {
177 const auto chBefore = mText.at(mPos - 1);
178 if (chBefore.isLetterOrNumber() || QLatin1String(s_allowedSpecialChars).contains(chBefore)) {
179 return false;
180 }
181 }
182
183 const auto segment = QStringView(mText).mid(mPos);
184 /* clang-format off */
185 return segment.startsWith(QLatin1String("http://"))
186 || segment.startsWith(QLatin1String("https://"))
187 || segment.startsWith(QLatin1String("vnc://"))
188 || segment.startsWith(QLatin1String("fish://"))
189 || segment.startsWith(QLatin1String("ftp://"))
190 || segment.startsWith(QLatin1String("ftps://"))
191 || segment.startsWith(QLatin1String("sftp://"))
192 || segment.startsWith(QLatin1String("smb://"))
193 || segment.startsWith(QLatin1String("irc://"))
194 || segment.startsWith(QLatin1String("ircs://"))
195 || segment.startsWith(QLatin1String("mailto:"))
196 || segment.startsWith(QLatin1String("www."))
197 || segment.startsWith(QLatin1String("ftp."))
198 || segment.startsWith(QLatin1String("file://"))
199 || segment.startsWith(QLatin1String("news:"))
200 || segment.startsWith(QLatin1String("tel:"))
201 || segment.startsWith(QLatin1String("xmpp:"));
202 /* clang-format on */
203}
204
205bool KTextToHTMLHelper::isEmptyUrl(const QString &url) const
206{
207 /* clang-format off */
208 return url.isEmpty()
209 || url == QLatin1String("http://")
210 || url == QLatin1String("https://")
211 || url == QLatin1String("fish://")
212 || url == QLatin1String("ftp://")
213 || url == QLatin1String("ftps://")
214 || url == QLatin1String("sftp://")
215 || url == QLatin1String("smb://")
216 || url == QLatin1String("vnc://")
217 || url == QLatin1String("irc://")
218 || url == QLatin1String("ircs://")
219 || url == QLatin1String("mailto")
220 || url == QLatin1String("mailto:")
221 || url == QLatin1String("www")
222 || url == QLatin1String("ftp")
223 || url == QLatin1String("news:")
224 || url == QLatin1String("news://")
225 || url == QLatin1String("tel")
226 || url == QLatin1String("tel:")
227 || url == QLatin1String("xmpp:");
228 /* clang-format on */
229}
230
231QString KTextToHTMLHelper::getUrl(bool *badurl)
232{
233 QString url;
234 if (atUrl()) {
235 // NOTE: see http://tools.ietf.org/html/rfc3986#appendix-A and especially appendix-C
236 // Appendix-C mainly says, that when extracting URLs from plain text, line breaks shall
237 // be allowed and should be ignored when the URI is extracted.
238
239 // This implementation follows this recommendation and
240 // allows the URL to be enclosed within different kind of brackets/quotes
241 // If an URL is enclosed, whitespace characters are allowed and removed, otherwise
242 // the URL ends with the first whitespace
243 // Also, if the URL is enclosed in brackets, the URL itself is not allowed
244 // to contain the closing bracket, as this would be detected as the end of the URL
245
246 QChar beforeUrl;
247 QChar afterUrl;
248
249 // detect if the url has been surrounded by brackets or quotes
250 if (mPos > 0) {
251 beforeUrl = mText.at(mPos - 1);
252
253 /*if ( beforeUrl == '(' ) {
254 afterUrl = ')';
255 } else */
256 if (beforeUrl == QLatin1Char('[')) {
257 afterUrl = QLatin1Char(']');
258 } else if (beforeUrl == QLatin1Char('<')) {
259 afterUrl = QLatin1Char('>');
260 } else if (beforeUrl == QLatin1Char('>')) { // for e.g. <link>http://.....</link>
261 afterUrl = QLatin1Char('<');
262 } else if (beforeUrl == QLatin1Char('"')) {
263 afterUrl = QLatin1Char('"');
264 }
265 }
266 url.reserve(mMaxUrlLen); // avoid allocs
267 int start = mPos;
268 bool previousCharIsSpace = false;
269 bool previousCharIsADoubleQuote = false;
270 bool previousIsAnAnchor = false;
271 /* clang-format off */
272 while (mPos < mText.length() //
273 && (mText.at(mPos).isPrint() || mText.at(mPos).isSpace())
274 && ((afterUrl.isNull() && !mText.at(mPos).isSpace())
275 || (!afterUrl.isNull() && mText.at(mPos) != afterUrl))) {
276 if (!previousCharIsSpace
277 && mText.at(mPos) == QLatin1Char('<')
278 && (mPos + 1) < mText.length()) { /* clang-format on */
279 // Fix Bug #346132: allow "http://www.foo.bar<http://foo.bar/>"
280 // < inside a URL is not allowed, however there is a test which
281 // checks that "http://some<Host>/path" should be allowed
282 // Therefore: check if what follows is another URL and if so, stop here
283 mPos++;
284 if (atUrl()) {
285 mPos--;
286 break;
287 }
288 mPos--;
289 }
290 if (!previousCharIsSpace && (mText.at(mPos) == QLatin1Char(' ')) && ((mPos + 1) < mText.length())) {
291 // Fix kmail bug: allow "http://www.foo.bar http://foo.bar/"
292 // Therefore: check if what follows is another URL and if so, stop here
293 mPos++;
294 if (atUrl()) {
295 mPos--;
296 break;
297 }
298 mPos--;
299 }
300 if (mText.at(mPos).isSpace()) {
301 previousCharIsSpace = true;
302 } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char('[')) {
303 break;
304 } else if (!previousIsAnAnchor && mText.at(mPos) == QLatin1Char(']')) {
305 break;
306 } else { // skip whitespace
307 if (previousCharIsSpace && mText.at(mPos) == QLatin1Char('<')) {
308 url.append(QLatin1Char(' '));
309 break;
310 }
311 previousCharIsSpace = false;
312 if (mText.at(mPos) == QLatin1Char('>') && previousCharIsADoubleQuote) {
313 // it's an invalid url
314 if (badurl) {
315 *badurl = true;
316 }
317 return QString();
318 }
319 if (mText.at(mPos) == QLatin1Char('"')) {
320 previousCharIsADoubleQuote = true;
321 } else {
322 previousCharIsADoubleQuote = false;
323 }
324 if (mText.at(mPos) == QLatin1Char('#')) {
325 previousIsAnAnchor = true;
326 }
327 url.append(mText.at(mPos));
328 if (url.length() > mMaxUrlLen) {
329 break;
330 }
331 }
332
333 ++mPos;
334 }
335
336 if (isEmptyUrl(url) || (url.length() > mMaxUrlLen)) {
337 mPos = start;
338 url.clear();
339 return url;
340 } else {
341 --mPos;
342 }
343 }
344
345 // HACK: This is actually against the RFC. However, most people don't properly escape the URL in
346 // their text with "" or <>. That leads to people writing an url, followed immediately by
347 // a dot to finish the sentence. That would lead the parser to include the dot in the url,
348 // even though that is not wanted. So work around that here.
349 // Most real-life URLs hopefully don't end with dots or commas.
350 QString wordBoundaries = QStringLiteral(".,:!?>");
351 bool hasOpenParenthese = url.contains(QLatin1Char('('));
352 if (!hasOpenParenthese) {
353 wordBoundaries += QLatin1Char(')');
354 }
355
356 if (url.length() > 1) {
357 do {
358 const QChar charact{url.at(url.length() - 1)};
359 if (wordBoundaries.contains(charact)) {
360 url.chop(1);
361 --mPos;
362 } else if (hasOpenParenthese && (charact == QLatin1Char(')'))) {
363 if (url.length() > 2) {
364 if (url.at(url.length() - 2) == QLatin1Char(')')) {
365 url.chop(1);
366 --mPos;
367 hasOpenParenthese = false;
368 } else {
369 break;
370 }
371 } else {
372 break;
373 }
374 } else {
375 break;
376 }
377 } while (url.length() > 1);
378 }
379 return url;
380}
381
382QString KTextToHTMLHelper::highlightedText()
383{
384 // formating symbols must be prepended with a whitespace
385 if ((mPos > 0) && !mText.at(mPos - 1).isSpace()) {
386 return QString();
387 }
388
389 const QChar ch = mText.at(mPos);
390 if (ch != QLatin1Char('/') && ch != QLatin1Char('*') && ch != QLatin1Char('_') && ch != QLatin1Char('-')) {
391 return QString();
392 }
393
394 const QRegularExpression re(QStringLiteral("\\%1([^\\s|^\\%1].*[^\\s|^\\%1])\\%1").arg(ch), QRegularExpression::InvertedGreedinessOption);
395 const auto match =
396 re.match(mText, mPos, QRegularExpression::NormalMatch, QRegularExpression::AnchorAtOffsetMatchOption); // clazy:exclude=use-static-qregularexpression
397
398 if (match.hasMatch()) {
399 if (match.capturedStart() == mPos) {
400 int length = match.capturedLength();
401 // there must be a whitespace after the closing formating symbol
402 if (mPos + length < mText.length() && !mText.at(mPos + length).isSpace()) {
403 return QString();
404 }
405 mPos += length - 1;
406 switch (ch.toLatin1()) {
407 case '*':
408 return QLatin1String("<b>*") + match.capturedView(1) + QLatin1String("*</b>");
409 case '_':
410 return QLatin1String("<u>_") + match.capturedView(1) + QLatin1String("_</u>");
411 case '/':
412 return QLatin1String("<i>/") + match.capturedView(1) + QLatin1String("/</i>");
413 case '-':
414 return QLatin1String("<s>-") + match.capturedView(1) + QLatin1String("-</s>");
415 }
416 }
417 }
418 return QString();
419}
420
421QString KTextToHTML::convertToHtml(const QString &plainText, const KTextToHTML::Options &flags, int maxUrlLen, int maxAddressLen)
422{
423 KTextToHTMLHelper helper(plainText, 0, maxUrlLen, maxAddressLen);
424
425 QString str;
426 QString result(static_cast<QChar *>(nullptr), helper.mText.length() * 2);
427 QChar ch;
428 int x;
429 bool startOfLine = true;
430
431 for (helper.mPos = 0, x = 0; helper.mPos < helper.mText.length(); ++helper.mPos, ++x) {
432 ch = helper.mText.at(helper.mPos);
433 if (flags & PreserveSpaces) {
434 if (ch == QLatin1Char(' ')) {
435 if (helper.mPos + 1 < helper.mText.length()) {
436 if (helper.mText.at(helper.mPos + 1) != QLatin1Char(' ')) {
437 // A single space, make it breaking if not at the start or end of the line
438 const bool endOfLine = helper.mText.at(helper.mPos + 1) == QLatin1Char('\n');
439 if (!startOfLine && !endOfLine) {
440 result += QLatin1Char(' ');
441 } else {
442 result += QLatin1String("&nbsp;");
443 }
444 } else {
445 // Whitespace of more than one space, make it all non-breaking
446 while (helper.mPos < helper.mText.length() && helper.mText.at(helper.mPos) == QLatin1Char(' ')) {
447 result += QLatin1String("&nbsp;");
448 ++helper.mPos;
449 ++x;
450 }
451
452 // We incremented once to often, undo that
453 --helper.mPos;
454 --x;
455 }
456 } else {
457 // Last space in the text, it is non-breaking
458 result += QLatin1String("&nbsp;");
459 }
460
461 if (startOfLine) {
462 startOfLine = false;
463 }
464 continue;
465 } else if (ch == QLatin1Char('\t')) {
466 do {
467 result += QLatin1String("&nbsp;");
468 ++x;
469 } while ((x & 7) != 0);
470 --x;
471 startOfLine = false;
472 continue;
473 }
474 }
475 if (ch == QLatin1Char('\n')) {
476 result += QLatin1String("<br />\n"); // Keep the \n, so apps can figure out the quoting levels correctly.
477 startOfLine = true;
478 x = -1;
479 continue;
480 }
481
482 startOfLine = false;
483 if (ch == QLatin1Char('&')) {
484 result += QLatin1String("&amp;");
485 } else if (ch == QLatin1Char('"')) {
486 result += QLatin1String("&quot;");
487 } else if (ch == QLatin1Char('<')) {
488 result += QLatin1String("&lt;");
489 } else if (ch == QLatin1Char('>')) {
490 result += QLatin1String("&gt;");
491 } else {
492 const int start = helper.mPos;
493 if (!(flags & IgnoreUrls)) {
494 bool badUrl = false;
495 str = helper.getUrl(&badUrl);
496 if (badUrl) {
497 QString resultBadUrl;
498 for (const QChar chBadUrl : std::as_const(helper.mText)) {
499 if (chBadUrl == QLatin1Char('&')) {
500 resultBadUrl += QLatin1String("&amp;");
501 } else if (chBadUrl == QLatin1Char('"')) {
502 resultBadUrl += QLatin1String("&quot;");
503 } else if (chBadUrl == QLatin1Char('<')) {
504 resultBadUrl += QLatin1String("&lt;");
505 } else if (chBadUrl == QLatin1Char('>')) {
506 resultBadUrl += QLatin1String("&gt;");
507 } else {
508 resultBadUrl += chBadUrl;
509 }
510 }
511 return resultBadUrl;
512 }
513 if (!str.isEmpty()) {
514 QString hyperlink;
515 if (str.startsWith(QLatin1String("www."))) {
516 hyperlink = QLatin1String("http://") + str;
517 } else if (str.startsWith(QLatin1String("ftp."))) {
518 hyperlink = QLatin1String("ftp://") + str;
519 } else {
520 hyperlink = str;
521 }
522 result += QLatin1String("<a href=\"") + hyperlink + QLatin1String("\">") + str.toHtmlEscaped() + QLatin1String("</a>");
523 x += helper.mPos - start;
524 continue;
525 }
526 str = helper.getEmailAddress();
527 if (!str.isEmpty()) {
528 // len is the length of the local part
529 int len = str.indexOf(QLatin1Char('@'));
530 QString localPart = str.left(len);
531
532 // remove the local part from the result (as '&'s have been expanded to
533 // &amp; we have to take care of the 4 additional characters per '&')
534 result.truncate(result.length() - len - (localPart.count(QLatin1Char('&')) * 4));
535 x -= len;
536
537 result += QLatin1String("<a href=\"mailto:") + str + QLatin1String("\">") + str + QLatin1String("</a>");
538 x += str.length() - 1;
539 continue;
540 }
541 if (flags & ConvertPhoneNumbers) {
542 str = helper.getPhoneNumber();
543 if (!str.isEmpty()) {
544 result += QLatin1String("<a href=\"tel:") + normalizePhoneNumber(str) + QLatin1String("\">") + str + QLatin1String("</a>");
545 x += str.length() - 1;
546 continue;
547 }
548 }
549 }
550 if (flags & HighlightText) {
551 str = helper.highlightedText();
552 if (!str.isEmpty()) {
553 result += str;
554 x += helper.mPos - start;
555 continue;
556 }
557 }
558 result += ch;
559 }
560 }
561
562 if (flags & ReplaceSmileys) {
563 result = KEmoticonsParser::parseEmoticons(result);
564 }
565
566 return result;
567}
Q_SCRIPTABLE Q_NOREPLY void start()
KCOREADDONS_EXPORT Result match(QStringView pattern, QStringView str)
This is the main function which does scored fuzzy matching.
PostalAddress address(const QVariant &location)
const QList< QKeySequence > & end()
KCOREADDONS_EXPORT QString convertToHtml(const QString &plainText, const KTextToHTML::Options &options, int maxUrlLen=4096, int maxAddressLen=255)
Converts plaintext into html.
@ ReplaceSmileys
Replace text emoticons smileys by emoticons images.
Definition ktexttohtml.h:37
@ IgnoreUrls
Don't parse and replace any URLs.
Definition ktexttohtml.h:42
@ ConvertPhoneNumbers
Replace phone numbers with tel: links.
Definition ktexttohtml.h:54
@ HighlightText
Interpret text highlighting markup, like bold, underline and /italic/, and wrap them in corresponding...
Definition ktexttohtml.h:48
@ PreserveSpaces
Preserve white-space formatting of the text.
Definition ktexttohtml.h:28
bool isNull() const const
char toLatin1() const const
qsizetype count() const const
QString & append(QChar ch)
const QChar at(qsizetype position) const const
void chop(qsizetype n)
void clear()
bool contains(QChar ch, Qt::CaseSensitivity cs) const const
qsizetype indexOf(QChar ch, qsizetype from, Qt::CaseSensitivity cs) const const
bool isEmpty() const const
QString left(qsizetype n) const const
qsizetype length() const const
void push_back(QChar ch)
void reserve(qsizetype size)
qsizetype size() const const
bool startsWith(QChar c, Qt::CaseSensitivity cs) const const
QString toHtmlEscaped() const const
void truncate(qsizetype position)
QStringView mid(qsizetype start, qsizetype length) const const
QChar at(qsizetype n) const const
const_iterator cbegin() const const
const_iterator cend() const const
bool contains(QChar c, Qt::CaseSensitivity cs) const const
qsizetype count(QChar ch, Qt::CaseSensitivity cs) const const
qsizetype size() const const
bool startsWith(QChar ch) const const
QString toString() const const
QStringView trimmed() const const
void truncate(qsizetype length)
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Tue Mar 26 2024 11:13:31 by doxygen 1.10.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.