Sonnet

guesslanguage.cpp
1 /* This file is part of the KDE libraries
2  SPDX-FileCopyrightText: 2006 Jacob R Rideout <[email protected]>
3  SPDX-FileCopyrightText: 2009 Jakub Stachowski <[email protected]>
4  SPDX-FileCopyrightText: 2013 Martin Sandsmark <[email protected]>
5 
6  SPDX-License-Identifier: LGPL-2.0-or-later
7 */
8 
9 #include <QCoreApplication>
10 #include <QDataStream>
11 #include <QFile>
12 #include <QLocale>
13 #include <QStandardPaths>
14 
15 #include "core_debug.h"
16 #include "guesslanguage.h"
17 #include "loader_p.h"
18 #include "speller.h"
19 #include "spellerplugin_p.h"
20 #include "tokenizer_p.h"
21 
22 /*
23 All language tags should be valid according to IETF BCP 47, as codified in RFC 4646.
24 ISO 639-1 codes should be used for the language part except for cases where there
25 exists no code, then 639-3 codes should be used. Country codes should only be used
26 in special cases. Scripts can be differentiated by IANA subtags, available here:
27 http://www.iana.org/assignments/language-subtag-registry
28 The script tags correspond to ISO 15924
29 
30 An overview of the best practices concerning language tagging is available here:
31 http://www.w3.org/International/articles/language-tags/Overview.en.php
32 
33 lang tags should use underscores (_) rather than hyphens (-) to separate subsections.
34 
35 EXCEPTIONS:
For cases of known differences between the above tagging scheme and major
spellcheckers such as aspell/hunspell/myspell, the scheme used by the spell checkers
shall be used. All exceptions shall be noted here:
39 
40 BCP SPELLCHECK
41 az-Latn az
42 
43 */
44 
45 namespace Sonnet
46 {
47 class GuessLanguagePrivate
48 {
49 public:
50  GuessLanguagePrivate();
51  // language trigram score
52  static QHash<QString, QHash<QString, int>> s_knownModels;
53 
54  void loadModels();
55  QList<QChar::Script> findRuns(const QString &text);
56  QVector<QString> createOrderedModel(const QString &content);
57  int distance(const QVector<QString> &model, const QHash<QString, int> &knownModel);
58  QStringList guessFromTrigrams(const QString &sample, const QStringList &langs);
59  QStringList identify(const QString &sample, const QList<QChar::Script> &scripts);
60  QString guessFromDictionaries(const QString &sentence, const QStringList &candidates);
61 
62  static QSet<QString> s_knownDictionaries;
63  static QMultiHash<QChar::Script, QString> s_scriptLanguages;
64  static QMap<QString, QString> s_dictionaryNameMap;
65 
66  const int MIN_LENGTH;
67  int m_maxItems;
68  double m_minConfidence;
69 };
70 
// Out-of-class definitions of the static tables declared in GuessLanguagePrivate.
// They start out empty; the GuessLanguagePrivate constructor and loadModels() fill them.
QHash<QString, QHash<QString, int>> GuessLanguagePrivate::s_knownModels;
QSet<QString> GuessLanguagePrivate::s_knownDictionaries;
QMultiHash<QChar::Script, QString> GuessLanguagePrivate::s_scriptLanguages;
QMap<QString, QString> GuessLanguagePrivate::s_dictionaryNameMap;
75 
76 QStringList getNames(QLocale::Script script)
77 {
78  QStringList locales;
79  const auto matchingLocales = QLocale::matchingLocales(QLocale::AnyLanguage, script, QLocale::AnyCountry);
80  locales.reserve(matchingLocales.size());
81  for (const QLocale &locale : matchingLocales) {
82  locales << locale.name();
83  }
84  return locales;
85 }
86 
87 GuessLanguagePrivate::GuessLanguagePrivate()
88  : MIN_LENGTH(5)
89  , m_maxItems(1)
90  , m_minConfidence(0)
91 {
92  if (!s_scriptLanguages.isEmpty()) {
93  return;
94  }
95 
96  const QStringList languages = Loader::openLoader()->languages();
97  s_knownDictionaries = QSet<QString>(languages.begin(), languages.end());
98  QSet<QString> dictionaryLanguages;
99  for (const QString &dictName : std::as_const(s_knownDictionaries)) {
100  QString languageName = QLocale(dictName).name();
101  if (languageName.isEmpty()) {
102  qCWarning(SONNET_LOG_CORE) << "Unable to parse name for dictionary" << dictName;
103  continue;
104  }
105  dictionaryLanguages.insert(languageName);
106  }
107 
108  QSet<QString> allLanguages;
109  for (int i = 0; i < int(QChar::ScriptCount); i++) {
110  QChar::Script script = static_cast<QChar::Script>(i);
111  QStringList names;
112  switch (script) {
113  case QChar::Script_Latin:
114  names = getNames(QLocale::LatinScript);
115  break;
116  case QChar::Script_Greek:
117  names = getNames(QLocale::GreekScript);
118  break;
120  names = getNames(QLocale::CyrillicScript);
121  break;
123  names = getNames(QLocale::ArmenianScript);
124  break;
126  names = getNames(QLocale::HebrewScript);
127  break;
129  names = getNames(QLocale::ArabicScript);
130  break;
132  names = getNames(QLocale::SyriacScript);
133  break;
135  names = getNames(QLocale::ThaanaScript);
136  break;
138  names = getNames(QLocale::DevanagariScript);
139  break;
141  names = getNames(QLocale::BengaliScript);
142  break;
144  names = getNames(QLocale::GurmukhiScript);
145  break;
147  names = getNames(QLocale::GujaratiScript);
148  break;
149  case QChar::Script_Oriya:
150  names = getNames(QLocale::OriyaScript);
151  break;
152  case QChar::Script_Tamil:
153  names = getNames(QLocale::TamilScript);
154  break;
156  names = getNames(QLocale::TeluguScript);
157  break;
159  names = getNames(QLocale::KannadaScript);
160  break;
162  names = getNames(QLocale::MalayalamScript);
163  break;
165  names = getNames(QLocale::SinhalaScript);
166  break;
167  case QChar::Script_Thai:
168  names = getNames(QLocale::ThaiScript);
169  break;
170  case QChar::Script_Lao:
171  names = getNames(QLocale::LaoScript);
172  break;
174  names = getNames(QLocale::TibetanScript);
175  break;
177  names = getNames(QLocale::MyanmarScript);
178  break;
180  names = getNames(QLocale::GeorgianScript);
181  break;
183  names = getNames(QLocale::HangulScript);
184  break;
186  names = getNames(QLocale::EthiopicScript);
187  break;
189  names = getNames(QLocale::CherokeeScript);
190  break;
192  names = getNames(QLocale::CanadianAboriginalScript);
193  break;
194  case QChar::Script_Ogham:
195  names = getNames(QLocale::OghamScript);
196  break;
197  case QChar::Script_Runic:
198  names = getNames(QLocale::RunicScript);
199  break;
200  case QChar::Script_Khmer:
201  names = getNames(QLocale::KhmerScript);
202  break;
204  names = getNames(QLocale::MongolianScript);
205  break;
207  names = getNames(QLocale::HiraganaScript);
208  break;
210  names = getNames(QLocale::KatakanaScript);
211  break;
213  names = getNames(QLocale::BopomofoScript);
214  break;
215  case QChar::Script_Han:
216  names = getNames(QLocale::HanScript);
217  break;
218  case QChar::Script_Yi:
219  names = getNames(QLocale::YiScript);
220  break;
222  names = getNames(QLocale::OldItalicScript);
223  break;
225  names = getNames(QLocale::GothicScript);
226  break;
228  names = getNames(QLocale::DeseretScript);
229  break;
231  names = getNames(QLocale::TagalogScript);
232  break;
234  names = getNames(QLocale::HanunooScript);
235  break;
236  case QChar::Script_Buhid:
237  names = getNames(QLocale::BuhidScript);
238  break;
240  names = getNames(QLocale::TagbanwaScript);
241  break;
243  names = getNames(QLocale::CopticScript);
244  break;
245  case QChar::Script_Limbu:
246  names = getNames(QLocale::LimbuScript);
247  break;
248  case QChar::Script_TaiLe:
249  names = getNames(QLocale::TaiLeScript);
250  break;
252  names = getNames(QLocale::LinearBScript);
253  break;
255  names = getNames(QLocale::UgariticScript);
256  break;
258  names = getNames(QLocale::ShavianScript);
259  break;
261  names = getNames(QLocale::OsmanyaScript);
262  break;
264  names = getNames(QLocale::CypriotScript);
265  break;
267  names = getNames(QLocale::BrailleScript);
268  break;
270  names = getNames(QLocale::BugineseScript);
271  break;
273  names = getNames(QLocale::NewTaiLueScript);
274  break;
276  names = getNames(QLocale::GlagoliticScript);
277  break;
279  names = getNames(QLocale::TifinaghScript);
280  break;
282  names = getNames(QLocale::SylotiNagriScript);
283  break;
285  names = getNames(QLocale::OldPersianScript);
286  break;
288  names = getNames(QLocale::KharoshthiScript);
289  break;
291  names = getNames(QLocale::BalineseScript);
292  break;
294  names = getNames(QLocale::CuneiformScript);
295  break;
297  names = getNames(QLocale::PhoenicianScript);
298  break;
300  names = getNames(QLocale::PhagsPaScript);
301  break;
302  case QChar::Script_Nko:
303  names = getNames(QLocale::NkoScript);
304  break;
306  names = getNames(QLocale::SundaneseScript);
307  break;
309  names = getNames(QLocale::LepchaScript);
310  break;
312  names = getNames(QLocale::OlChikiScript);
313  break;
314  case QChar::Script_Vai:
315  names = getNames(QLocale::VaiScript);
316  break;
318  names = getNames(QLocale::SaurashtraScript);
319  break;
321  names = getNames(QLocale::KayahLiScript);
322  break;
324  names = getNames(QLocale::RejangScript);
325  break;
327  names = getNames(QLocale::LycianScript);
328  break;
330  names = getNames(QLocale::CarianScript);
331  break;
333  names = getNames(QLocale::LydianScript);
334  break;
335  case QChar::Script_Cham:
336  names = getNames(QLocale::ChamScript);
337  break;
339  names = getNames(QLocale::LannaScript);
340  break;
342  names = getNames(QLocale::TaiVietScript);
343  break;
345  names = getNames(QLocale::AvestanScript);
346  break;
348  names = getNames(QLocale::EgyptianHieroglyphsScript);
349  break;
351  names = getNames(QLocale::SamaritanScript);
352  break;
353  case QChar::Script_Lisu:
354  names = getNames(QLocale::FraserScript);
355  break;
356  case QChar::Script_Bamum:
357  names = getNames(QLocale::BamumScript);
358  break;
360  names = getNames(QLocale::JavaneseScript);
361  break;
363  names = getNames(QLocale::MeiteiMayekScript);
364  break;
366  names = getNames(QLocale::ImperialAramaicScript);
367  break;
369  names = getNames(QLocale::OldSouthArabianScript);
370  break;
372  names = getNames(QLocale::InscriptionalParthianScript);
373  break;
375  names = getNames(QLocale::InscriptionalPahlaviScript);
376  break;
378  names = getNames(QLocale::KaithiScript);
379  break;
380  case QChar::Script_Batak:
381  names = getNames(QLocale::BatakScript);
382  break;
384  names = getNames(QLocale::BrahmiScript);
385  break;
387  names = getNames(QLocale::MandaeanScript);
388  break;
390  names = getNames(QLocale::ChakmaScript);
391  break;
394  names = getNames(QLocale::MeroiticCursiveScript);
395  names.append(getNames(QLocale::MeroiticScript));
396  break;
397  case QChar::Script_Miao:
398  names = getNames(QLocale::PollardPhoneticScript);
399  break;
401  names = getNames(QLocale::SharadaScript);
402  break;
404  names = getNames(QLocale::SoraSompengScript);
405  break;
406  case QChar::Script_Takri:
407  names = getNames(QLocale::TakriScript);
408  break;
410  names = getNames(QLocale::CaucasianAlbanianScript);
411  break;
413  names = getNames(QLocale::BassaVahScript);
414  break;
416  names = getNames(QLocale::DuployanScript);
417  break;
419  names = getNames(QLocale::ElbasanScript);
420  break;
422  names = getNames(QLocale::GranthaScript);
423  break;
425  names = getNames(QLocale::PahawhHmongScript);
426  break;
428  names = getNames(QLocale::KhojkiScript);
429  break;
431  names = getNames(QLocale::LinearAScript);
432  break;
434  names = getNames(QLocale::MahajaniScript);
435  break;
437  names = getNames(QLocale::ManichaeanScript);
438  break;
440  names = getNames(QLocale::MendeKikakuiScript);
441  break;
442  case QChar::Script_Modi:
443  names = getNames(QLocale::ModiScript);
444  break;
445  case QChar::Script_Mro:
446  names = getNames(QLocale::MroScript);
447  break;
449  names = getNames(QLocale::OldNorthArabianScript);
450  break;
452  names = getNames(QLocale::NabataeanScript);
453  break;
455  names = getNames(QLocale::PalmyreneScript);
456  break;
458  names = getNames(QLocale::PauCinHauScript);
459  break;
461  names = getNames(QLocale::OldPermicScript);
462  break;
464  names = getNames(QLocale::PsalterPahlaviScript);
465  break;
467  names = getNames(QLocale::SiddhamScript);
468  break;
470  names = getNames(QLocale::KhudawadiScript);
471  break;
473  names = getNames(QLocale::TirhutaScript);
474  break;
476  names = getNames(QLocale::VarangKshitiScript);
477  break;
478  case QChar::Script_Ahom:
479  names = getNames(QLocale::AhomScript);
480  break;
482  names = getNames(QLocale::AnatolianHieroglyphsScript);
483  break;
485  names = getNames(QLocale::HatranScript);
486  break;
488  names = getNames(QLocale::MultaniScript);
489  break;
491  names = getNames(QLocale::OldHungarianScript);
492  break;
498  break;
499  default:
500  qCDebug(SONNET_LOG_CORE) << "Unhandled script" << script;
501  break;
502  }
503  allLanguages.unite(QSet<QString>(names.constBegin(), names.constEnd()));
504 
505  { // Remove unknown languages
506  QStringList pruned;
507  for (const QString &name : std::as_const(names)) {
508  if (!dictionaryLanguages.contains(name)) {
509  continue;
510  }
511  pruned.append(name);
512  }
513  names = pruned;
514  }
515 
516  if (names.isEmpty()) {
517  continue;
518  }
519 
520  for (const QString &name : std::as_const(names)) {
521  s_scriptLanguages.insert(script, name);
522  }
523  }
524 
525  // Try to handle some badly named dictionaries
526  if (!allLanguages.contains(s_knownDictionaries)) {
527  QSet<QString> dicts(s_knownDictionaries);
528  dicts.subtract(allLanguages);
529  for (const QString &dictName : std::as_const(dicts)) {
530  QString languageName = QLocale(dictName).name();
531  if (languageName.isEmpty()) {
532  qCWarning(SONNET_LOG_CORE) << "Unable to parse language name" << dictName;
533  continue;
534  }
535  s_dictionaryNameMap[languageName] = dictName;
536  if (std::find(s_scriptLanguages.cbegin(), s_scriptLanguages.cend(), languageName) == s_scriptLanguages.cend()) {
537  qCWarning(SONNET_LOG_CORE) << "Unable to handle language from dictionary" << dictName << languageName;
538  }
539  }
540  }
541 }
542 
544  : d(new GuessLanguagePrivate)
545 {
546 }
547 
549 {
550  delete d;
551 }
552 
553 QString GuessLanguage::identify(const QString &text, const QStringList &suggestionsListIn) const
554 {
555  if (text.isEmpty()) {
556  return QString();
557  }
558 
559  // Filter for available dictionaries
560  QStringList suggestionsList;
561  for (const QString &suggestion : suggestionsListIn) {
562  if (d->s_knownDictionaries.contains(suggestion) && !suggestionsList.contains(suggestion)) {
563  suggestionsList.append(suggestion);
564  }
565  }
566 
567  // Load the model on demand
568  if (d->s_knownModels.isEmpty()) {
569  d->loadModels();
570  }
571 
572  const QList<QChar::Script> scriptsList = d->findRuns(text);
573 
574  QStringList candidateLanguages = d->identify(text, scriptsList);
575 
576  // if guessing from trigrams fail
577  for (const QChar::Script script : scriptsList) {
578  const auto languagesList = d->s_scriptLanguages.values(script);
579  for (const QString &lang : languagesList) {
580  if (!d->s_knownModels.contains(lang)) {
581  candidateLanguages.append(lang);
582  }
583  }
584  }
585 
586  // Hack for some bad dictionary names
587  for (int i = 0; i < candidateLanguages.count(); i++) {
588  if (d->s_dictionaryNameMap.contains(candidateLanguages[i])) {
589  candidateLanguages[i] = d->s_dictionaryNameMap.value(candidateLanguages[i]);
590  }
591  }
592 
593  if (candidateLanguages.count() == 1) {
594  return candidateLanguages.first();
595  }
596 
597  // Wasn't able to get a good guess with the trigrams, try checking all
598  // dictionaries for the suggested languages.
599  candidateLanguages.append(suggestionsList);
600  candidateLanguages.removeDuplicates();
601  QString identified = d->guessFromDictionaries(text, candidateLanguages);
602  if (!identified.isEmpty()) {
603  return identified;
604  }
605 
606  qCDebug(SONNET_LOG_CORE()) << "Unable to identify string with dictionaries:" << text;
607 
608  // None of our methods worked, just return the best suggestion
609  if (!suggestionsList.isEmpty()) {
610  return suggestionsList.first();
611  }
612 
613  qCDebug(SONNET_LOG_CORE) << "Unable to find any suggestion for" << text;
614 
615  // Not even any suggestions, give up
616  return QString();
617 }
618 
/**
 * Stores the limits that guessFromTrigrams() consults while collecting
 * results: it stops adding languages once maxItems have been gathered or the
 * accumulated confidence reaches minConfidence.
 *
 * @param maxItems      value stored in m_maxItems
 * @param minConfidence value stored in m_minConfidence
 */
void GuessLanguage::setLimits(int maxItems, double minConfidence)
{
    d->m_maxItems = maxItems;
    d->m_minConfidence = minConfidence;
}
624 
625 void GuessLanguagePrivate::loadModels()
626 {
627  // use trigrams from resource file, easy to deploy on all platforms
628  const QString triMapFile = QStringLiteral(":/org.kde.sonnet/trigrams.map");
629  qCDebug(SONNET_LOG_CORE) << "Loading trigrams from" << triMapFile;
630 
631  QFile sin(triMapFile);
632  if (!sin.open(QIODevice::ReadOnly)) {
633  qCWarning(SONNET_LOG_CORE) << "Sonnet: Unable to load trigram models from file" << triMapFile;
634  return;
635  }
636 
637  QDataStream in(&sin);
638  in >> s_knownModels;
639 
640  // Sanity check
641  QSet<QString> availableLanguages;
642  QHashIterator<QString, QHash<QString, int>> iterator(s_knownModels);
643  while (iterator.hasNext()) {
644  iterator.next();
645  if (iterator.value().count() < MAXGRAMS) {
646  qCWarning(SONNET_LOG_CORE) << iterator.key() << "is has only" << iterator.value().count() << "trigrams, expected" << MAXGRAMS;
647  }
648  availableLanguages.insert(iterator.key());
649  }
650  QSet<QString> knownLanguages(s_scriptLanguages.constBegin(), s_scriptLanguages.constEnd());
651  knownLanguages.subtract(availableLanguages);
652  if (!knownLanguages.isEmpty()) {
653  qCDebug(SONNET_LOG_CORE) << "Missing trigrams for languages:" << knownLanguages;
654  }
655 }
656 
657 QList<QChar::Script> GuessLanguagePrivate::findRuns(const QString &text)
658 {
659  QHash<QChar::Script, int> scriptCounts;
660 
661  int totalCount = 0;
662 
663  for (const QChar c : text) {
664  const QChar::Script script = c.script();
665 
666  if (script == QChar::Script_Common || script == QChar::Script_Inherited) {
667  continue;
668  }
669 
670  if (!c.isLetter()) {
671  continue;
672  }
673 
674  scriptCounts[script]++;
675  totalCount++;
676  }
677 
678  QList<QChar::Script> relevantScripts;
679 
680  if (totalCount == 0) {
681  return relevantScripts;
682  }
683 
684  if (scriptCounts.size() == 1) {
685  return {scriptCounts.cbegin().key()};
686  }
687 
688  for (auto it = scriptCounts.cbegin(); it != scriptCounts.cend(); ++it) {
689  // return run types that used for 40% or more of the string
690  const int scriptCount = it.value();
691  const auto currentScript = it.key();
692  if (scriptCount * 100 / totalCount >= 40) {
693  relevantScripts << currentScript;
694  // always return basic latin if found more than 15%.
695  } else if (currentScript == QChar::Script_Latin && scriptCount * 100 / totalCount >= 15) {
696  relevantScripts << currentScript;
697  }
698  }
699 
700  return relevantScripts;
701 }
702 
703 QStringList GuessLanguagePrivate::identify(const QString &sample, const QList<QChar::Script> &scripts)
704 {
705  if (sample.size() < MIN_LENGTH) {
706  return QStringList();
707  }
708 
709  QStringList guesses;
710  for (const QChar::Script script : scripts) {
711  guesses.append(guessFromTrigrams(sample, s_scriptLanguages.values(script)));
712  }
713 
714  return guesses;
715 }
716 
717 QStringList GuessLanguagePrivate::guessFromTrigrams(const QString &sample, const QStringList &languages)
718 {
719  QStringList ret;
720 
721  const QVector<QString> sampleTrigrams = createOrderedModel(sample);
722 
723  // Sort by score
725  for (const QString &language : languages) {
726  if (s_knownModels.contains(language)) {
727  scores.insert(distance(sampleTrigrams, s_knownModels[language]), language);
728  }
729  }
730 
731  // Skip if either no results or best result is completely unknown (distance >= maxdistance)
732  if (scores.isEmpty() || scores.firstKey() >= MAXGRAMS * sampleTrigrams.size()) {
733  qCDebug(SONNET_LOG_CORE) << "No scores for" << sample;
734  return ret;
735  }
736 
737  int counter = 0;
738  double confidence = 0;
739 
740 #if QT_VERSION >= QT_VERSION_CHECK(6, 0, 0)
741  QMultiMapIterator<int, QString> it(scores);
742 #else
743  QMapIterator<int, QString> it(scores);
744 #endif
745  it.next();
746 
747  QString prevItem = it.value();
748  int prevScore = it.key();
749 
750  while (it.hasNext() && counter < m_maxItems && confidence < m_minConfidence) {
751  it.next();
752  counter++;
753  confidence += (it.key() - prevScore) / (double)it.key();
754  ret += prevItem;
755  prevItem = it.value();
756  prevScore = it.key();
757  }
758  if (counter < m_maxItems && confidence < m_minConfidence) {
759  ret += prevItem;
760  }
761 
762  return ret;
763 }
764 
765 QVector<QString> GuessLanguagePrivate::createOrderedModel(const QString &content)
766 {
767  QHash<QString, int> trigramCounts;
768 
769  // collect trigrams
770  trigramCounts.reserve(content.size() - 2);
771  for (int i = 0; i < (content.size() - 2); ++i) {
772  QString tri = content.mid(i, 3).toLower();
773  trigramCounts[tri]++;
774  }
775 
776  // invert the map <freq, trigram>
777  QVector<QPair<int, QString>> trigramFrequencyList;
778  trigramFrequencyList.reserve(trigramCounts.size());
779 
780  auto it = trigramCounts.constBegin();
781  for (; it != trigramCounts.constEnd(); ++it) {
782  const QChar *data = it.key().constData();
783  bool hasTwoSpaces = (data[1].isSpace() && (data[0].isSpace() || data[2].isSpace()));
784 
785  if (!hasTwoSpaces) {
786  const int freq = it.value();
787  const QString &trigram = it.key();
788  trigramFrequencyList.append({freq, trigram});
789  }
790  }
791 
792  // sort descending by frequency
793  std::sort(trigramFrequencyList.begin(), trigramFrequencyList.end(), [](const QPair<int, QString> &a, const QPair<int, QString> &b) {
794  return a.first > b.first;
795  });
796 
797  QVector<QString> orderedTrigrams;
798  orderedTrigrams.reserve(trigramFrequencyList.size());
799  for (const auto &tri : std::as_const(trigramFrequencyList)) {
800  orderedTrigrams.append(tri.second);
801  }
802 
803  return orderedTrigrams;
804 }
805 
806 int GuessLanguagePrivate::distance(const QVector<QString> &model, const QHash<QString, int> &knownModel)
807 {
808  int counter = -1;
809  int dist = 0;
810 
811  for (const QString &trigram : model) {
812  const int val = knownModel.value(trigram, -1);
813  if (val != -1) {
814  dist += qAbs(++counter - val);
815  } else {
816  dist += MAXGRAMS;
817  }
818 
819  if (counter == (MAXGRAMS - 1)) {
820  break;
821  }
822  }
823 
824  return dist;
825 }
826 
827 QString GuessLanguagePrivate::guessFromDictionaries(const QString &sentence, const QStringList &candidates)
828 {
829  // Try to see how many languages we can get spell checking for
831  for (const QString &lang : candidates) {
832  if (!Loader::openLoader()->languages().contains(lang)) {
833  qCWarning(SONNET_LOG_CORE) << "Dictionary asked for invalid speller" << lang;
834  continue;
835  }
836  QSharedPointer<SpellerPlugin> plugin = Loader::openLoader()->cachedSpeller(lang);
837  if (!plugin.isNull()) {
838  spellers.append(plugin);
839  }
840  }
841 
842  // If there's no spell checkers, give up
843  if (spellers.isEmpty()) {
844  return QString();
845  }
846 
847  QMap<QString, int> correctHits;
848 
849  WordTokenizer tokenizer(sentence);
850  while (tokenizer.hasNext()) {
851  Token word = tokenizer.next();
852  if (!tokenizer.isSpellcheckable()) {
853  continue;
854  }
855 
856  for (int i = 0; i < spellers.count(); ++i) {
857  if (spellers[i]->isCorrect(word.toString())) {
858  correctHits[spellers[i]->language()]++;
859  }
860  }
861  }
862 
863  if (correctHits.isEmpty()) {
864  return QString();
865  }
866 
868  for (QMap<QString, int>::const_iterator itr = correctHits.constBegin(); itr != correctHits.constEnd(); ++itr) {
869  if (itr.value() > max.value()) {
870  max = itr;
871  }
872  }
873  return max.key();
874 }
875 }
void append(const T &value)
QMap::const_iterator constBegin() const const
T & first()
const T value(const Key &key) const const
GuessLanguage()
Constructor Creates a new GuessLanguage instance.
const Key & firstKey() const const
int size() const const
bool isNull() const const
QVector::iterator begin()
int count(const T &value) const const
bool contains(const QString &str, Qt::CaseSensitivity cs) const const
void append(const T &value)
QList::const_iterator constBegin() const const
QString identify(const QString &text, const QStringList &suggestions=QStringList()) const
Returns the two-letter ISO 639-1 code for the language of the given text.
QHash::const_iterator cend() const const
void reserve(int alloc)
QList< QLocale > matchingLocales(QLocale::Language language, QLocale::Script script, QLocale::Country country)
int size() const const
KOSM_EXPORT double distance(const std::vector< const OSM::Node * > &path, Coordinate coord)
bool isSpace() const const
QHash::const_iterator constBegin() const const
QHash::const_iterator constEnd() const const
QMap::const_iterator constEnd() const const
void reserve(int size)
bool isEmpty() const const
QString name() const const
bool isEmpty() const const
typename QMap< Key, T >::iterator insert(const Key &key, const T &value)
~GuessLanguage()
Destructor.
void reserve(int size)
QStringList knownLanguages(Types::ComponentTypes types)
void insert(int i, const T &value)
bool contains(const T &value) const const
LocaleWrapper locale()
The sonnet namespace.
int removeDuplicates()
QVector::iterator end()
QString toLower() const const
QList::const_iterator constEnd() const const
QSet< T > & unite(const QSet< T > &other)
QSet::iterator insert(const T &value)
QList::iterator begin()
int size() const const
QHash::const_iterator cbegin() const const
QList::iterator end()
QString mid(int position, int n) const const
void setLimits(int maxItems, double minConfidence)
Sets limits to number of languages returned by identify().
bool isEmpty() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Mon Dec 4 2023 03:59:00 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.