KHtml

kencodingdetector.h
1 /*
2  This file is part of the KDE libraries
3 
4  Copyright (C) 1999 Lars Knoll ([email protected])
5  Copyright (C) 2007 Nick Shaforostoff ([email protected])
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Library General Public
9  License as published by the Free Software Foundation; either
10  version 2 of the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Library General Public License for more details.
16 
17  You should have received a copy of the GNU Library General Public License
18  along with this library; see the file COPYING.LIB. If not, write to
19  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20  Boston, MA 02110-1301, USA.
21 
22 */
23 #ifndef KENCODINGDETECTOR_H
24 #define KENCODINGDETECTOR_H
25 
26 #include <QString>
27 
28 class QTextCodec;
29 class QTextDecoder;
30 class KEncodingDetectorPrivate;
31 
32 /**
33  * @short Provides encoding detection capabilities.
34  *
35  * Searches for an encoding declaration inside raw data -- meta and XML tags.
36  * If it can't find one, uses heuristics for the specified language.
37  *
38  * If it finds Unicode BOM marks, it changes the encoding regardless of what the user has specified.
39  *
40  * Intended lifetime of the object: one instance per document.
41  *
42  * Typical use:
43  * \code
44  * QByteArray data;
45  * ...
46  * KEncodingDetector detector;
47  * detector.setAutoDetectLanguage(KEncodingDetector::Cyrillic);
48  * QString out=detector.decode(data);
49  * \endcode
50  *
51  *
52  * Do not mix decode() with decodeWithBuffering()
53  *
54  * @short Guess encoding of char array
55  *
56  */
58 {
59 public:
60  enum EncodingChoiceSource {
61  DefaultEncoding,
62  AutoDetectedEncoding,
63  BOM,
64  EncodingFromXMLHeader,
65  EncodingFromMetaTag,
66  EncodingFromHTTPHeader,
67  UserChosenEncoding
68  };
69 
70  enum AutoDetectScript {
71  None,
72  SemiautomaticDetection,
73  Arabic,
74  Baltic,
75  CentralEuropean,
76  ChineseSimplified,
77  ChineseTraditional,
78  Cyrillic,
79  Greek,
80  Hebrew,
81  Japanese,
82  Korean,
83  NorthernSaami,
84  SouthEasternEurope,
85  Thai,
86  Turkish,
87  Unicode,
88  WesternEuropean
89  };
90 
91  /**
92  * Default codec is latin1 (as html spec says), EncodingChoiceSource is default, AutoDetectScript=Semiautomatic
93  */
95 
96  /**
97  * Allows setting the default codec, EncodingChoiceSource and AutoDetectScript
98  */
99  KEncodingDetector(QTextCodec *codec, EncodingChoiceSource source, AutoDetectScript script = None);
101 
102  //const QTextCodec* codec() const;
103 
104  /**
105  * @returns true if specified encoding was recognized
106  */
107  bool setEncoding(const char *encoding, EncodingChoiceSource type);
108 
109  /**
110  * Convenience method.
111  * @returns mime name of detected encoding
112  */
113  const char *encoding() const;
114 
115  bool visuallyOrdered() const;
116 
117 // void setAutoDetectLanguage( const QString& );
118 // const QString& autoDetectLanguage() const;
119 
120  void setAutoDetectLanguage(AutoDetectScript);
121  AutoDetectScript autoDetectLanguage() const;
122 
123  EncodingChoiceSource encodingChoiceSource() const;
124 
125  /**
126  * The main class method
127  *
128  * Calls protected analyze() only the first time in the whole lifetime of the object
129  *
130  * Replaces all null chars with spaces.
131  */
132  QString decode(const char *data, int len);
133  QString decode(const QByteArray &data);
134 
135  //* You don't need to call analyze() if you use this method.
136  /**
137  * Convenience method that uses buffering. It waits for full html head to be buffered
138  * (i.e. calls analyze every time until it returns true).
139  *
140  * Replaces all null chars with spaces.
141  *
142  * @returns Decoded data, or empty string, if there was not enough data for accurate detection
143  * @see flush()
144  */
145  QString decodeWithBuffering(const char *data, int len);
146 
147  /**
148  * This method checks whether invalid characters were found
149  * during a decoding operation.
150  *
151  * Note that this bit is never reset once invalid characters have been found.
152  * To force a reset, either change the encoding using setEncoding() or call
153  * resetDecoder()
154  *
155  * @returns a boolean reflecting said state.
156  * @since 4.3
157  * @see resetDecoder() setEncoding()
158  */
159  bool decodedInvalidCharacters() const;
160 
161  /**
162  * Resets the decoder. Any stateful decoding information (such as resulting from previous calls
163  * to decodeWithBuffering()) will be lost.
164  * Will reset the state of decodedInvalidCharacters() as a side effect.
165  *
166  * @since 4.3
167  * @see decodeWithBuffering() decodedInvalidCharacters()
168  *
169  */
170  void resetDecoder();
171 
172  /**
173  * Convenience method to be used with decodeWithBuffering(). Flushes the buffer.
174  * @see decodeWithBuffering()
175  */
176  QString flush();
177 
178  /**
179  * Takes the language name _after_ it has been i18n()'ed
180  */
181  static AutoDetectScript scriptForName(const QString &lang);
182  static QString nameForScript(AutoDetectScript);
183  static bool hasAutoDetectionForScript(AutoDetectScript);
184 
185 protected:
186  /**
187  * This nice method will kill all 0 bytes (or double bytes)
188  * and remember if this was a binary or not ;)
189  */
190  bool processNull(char *data, int length);
191 
192  /**
193  * Check if we are really utf8. Taken from kate
194  *
195  * @returns true if current encoding is utf8 and the text cannot be in this encoding
196  *
197  * Please somebody read https://en.wikipedia.org/wiki/UTF-8 and check this code...
198  */
199  bool errorsIfUtf8(const char *data, int length);
200 
201  /**
202  * Analyze text data.
203  * @returns true if there was enough data for accurate detection
204  */
205  bool analyze(const char *data, int len);
206 
207  /**
208  * @returns QTextDecoder for detected encoding
209  */
211 
212 private:
213  KEncodingDetectorPrivate *const d;
214 };
215 
216 #endif
bool decodedInvalidCharacters() const
This method checks whether invalid characters were found during a decoding operation.
static AutoDetectScript scriptForName(const QString &lang)
Takes the language name after it has been i18n()'ed.
QString decodeWithBuffering(const char *data, int len)
Convenience method that uses buffering.
KEncodingDetector()
Default codec is latin1 (as html spec says), EncodingChoiceSource is default, AutoDetectScript=Semiau...
QString flush()
Convenience method to be used with decodeWithBuffering().
QTextDecoder * decoder()
const char * encoding() const
Convenience method.
bool processNull(char *data, int length)
This nice method will kill all 0 bytes (or double bytes) and remember if this was a binary or not ;)
void resetDecoder()
Resets the decoder.
bool setEncoding(const char *encoding, EncodingChoiceSource type)
QString decode(const char *data, int len)
The main class method.
bool errorsIfUtf8(const char *data, int length)
Check if we are really utf8.
Provides encoding detection capabilities.
bool analyze(const char *data, int len)
Analyze text data.
This file is part of the KDE documentation.
Documentation copyright © 1996-2022 The KDE developers.
Generated on Thu Oct 6 2022 04:15:20 by doxygen 1.8.17 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.