Sonnet

guesslanguage.h
1 /* This file is part of the KDE libraries
2  SPDX-FileCopyrightText: 2006 Jacob R Rideout <[email protected]>
3 
4  SPDX-License-Identifier: LGPL-2.0-or-later
5 */
6 
7 #ifndef GUESSLANGUAGE_H
8 #define GUESSLANGUAGE_H
9 
10 #include <QString>
11 #include <QStringList>
12 
13 #include "sonnetcore_export.h"
14 
15 namespace Sonnet
16 {
17 // Number of trigrams in each file (NOTE(review): presumably the per-language trigram model files - confirm)
18 static const int MAXGRAMS = 300;
19 
20 class GuessLanguagePrivate;
21 
22 /**
23  * @class Sonnet::GuessLanguage guesslanguage.h <Sonnet/GuessLanguage>
24  *
25  * @short GuessLanguage determines the language of a given text.
26  *
27  * GuessLanguage can determine the difference between ~75 languages for a given string. It is
28  * based off a Perl script originally written by Maciej Ceglowski <[email protected]>
29  * called Languid. His script used a 2 part heuristic to determine language. First the text
30  * is checked for the scripts it contains, then for each set of languages using those
31  * scripts an n-gram frequency model of a given language is compared to a model of the text.
32  * The most similar language model is assumed to be the language. If no language is found
33  * an empty string is returned.
34  *
35  *
36  * @author Jacob Rideout <[email protected]>
37  * @since 4.3
38  */
39 class SONNETCORE_EXPORT GuessLanguage
40 {
41 public:
42  /** Constructor
43  * Creates a new GuessLanguage instance. The constructor takes no
44  * arguments; the text to be checked is passed to identify().
45  * The result limits can be adjusted afterwards with setLimits().
46  */
47  GuessLanguage();
48 
49  /** Destructor
50  */
51  ~GuessLanguage();
52  // Not copyable: copy construction and copy assignment are both deleted.
53  GuessLanguage(const GuessLanguage &) = delete;
54  GuessLanguage &operator=(const GuessLanguage &) = delete;
55 
56  /**
57  * Sets limits on the number of languages returned by identify(). The confidence for each language is computed
58  * as the difference between this and the next language on the list, normalized to the 0-1 range. A reasonable value
59  * to get a fairly sure result is 0.1. The default is returning the best guess without caring about confidence -
60  * exactly as after a call to setLimits(1,0).
61  * @param maxItems The list returned by identify() will never have more than maxItems items
62  * @param minConfidence The list will have only enough items for their summed confidence to equal
63  * or exceed minConfidence.
64  */
65  void setLimits(int maxItems, double minConfidence);
66 
67  /**
68  * Identifies the language of @p text. The result is a 2 letter ISO 639-1 code; a 3 letter
69  * code is returned only in the case where a 2 letter code does not exist.
70  * @param text the text to be identified
71  * @param suggestions optional, defaults to an empty list. NOTE(review): presumably candidate languages to prefer - confirm
72  * @return the presumed language as a single QString (the declared return type). NOTE(review): earlier wording described
73  * a list sorted by decreasing confidence, empty when the confidence required by setLimits() cannot be reached - confirm
74  */
75  QString identify(const QString &text, const QStringList &suggestions = QStringList()) const;
76 
77 private:
78  GuessLanguagePrivate *const d; // const pointer to GuessLanguagePrivate, which is only forward-declared in this header
79 };
80 }
81 
82 #endif
The sonnet namespace.
GuessLanguage determines the language of a given text.
Definition: guesslanguage.h:39
This file is part of the KDE documentation.
Documentation copyright © 1996-2023 The KDE developers.
Generated on Thu Dec 7 2023 04:06:35 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.