• Skip to content
  • Skip to link menu
KDE API Reference
  • KDE API Reference
  • kdelibs API Reference
  • KDE Home
  • Contact Us
 

KDECore

  • sources
  • kde-4.12
  • kdelibs
  • kdecore
  • localization
kencodingprober.cpp
Go to the documentation of this file.
1 /*
2  This file is part of the KDE libraries
3 
4  Copyright (C) 2008 Wang Hoi (zealot.hoi@gmail.com)
5 
6  This library is free software; you can redistribute it and/or
7  modify it under the terms of the GNU Library General Public
8  License as published by the Free Software Foundation; either
9  version 2 of the License, or (at your option) any later version.
10 
11  This library is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  Library General Public License for more details.
15 
16  You should have received a copy of the GNU Library General Public License
17  along with this library; see the file COPYING.LIB. If not, write to
18  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19  Boston, MA 02110-1301, USA.
20 
21 */
22 
23 #include "kencodingprober.h"
24 
25 #include "klocale.h"
26 
27 #include "probers/nsCharSetProber.h"
28 #include "probers/nsUniversalDetector.h"
29 #include "probers/ChineseGroupProber.h"
30 #include "probers/JapaneseGroupProber.h"
31 #include "probers/UnicodeGroupProber.h"
32 #include "probers/nsSBCSGroupProber.h"
33 #include "probers/nsMBCSGroupProber.h"
34 
35 #include <string.h>
36 
37 class KEncodingProberPrivate
38 {
39 public:
40  KEncodingProberPrivate(): prober(NULL), mStart(true) {};
41  ~KEncodingProberPrivate()
42  {
43  delete prober;
44  }
45  void setProberType(KEncodingProber::ProberType pType)
46  {
47  proberType = pType;
48  /* handle multi-byte encodings carefully , because they're hard to detect,
49  * and have to use some Stastics methods.
50  * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok,
51  * because encoding state machine can detect many such encodings.
52  */
53 
54  delete prober;
55 
56  switch (proberType) {
57  case KEncodingProber::None:
58  prober = NULL;
59  break;
60  case KEncodingProber::Arabic:
61  case KEncodingProber::Baltic:
62  case KEncodingProber::CentralEuropean:
63  case KEncodingProber::Cyrillic:
64  case KEncodingProber::Greek:
65  case KEncodingProber::Hebrew:
66  case KEncodingProber::NorthernSaami:
67  case KEncodingProber::Other:
68  case KEncodingProber::SouthEasternEurope:
69  case KEncodingProber::Thai:
70  case KEncodingProber::Turkish:
71  case KEncodingProber::WesternEuropean:
72  prober = new kencodingprober::nsSBCSGroupProber();
73  break;
74  case KEncodingProber::ChineseSimplified:
75  case KEncodingProber::ChineseTraditional:
76  prober = new kencodingprober::ChineseGroupProber();
77  break;
78  case KEncodingProber::Japanese:
79  prober = new kencodingprober::JapaneseGroupProber();
80  break;
81  case KEncodingProber::Korean:
82  prober = new kencodingprober::nsMBCSGroupProber();
83  break;
84  case KEncodingProber::Unicode:
85  prober = new kencodingprober::UnicodeGroupProber();
86  break;
87  case KEncodingProber::Universal:
88  prober = new kencodingprober::nsUniversalDetector();
89  break;
90  default:
91  prober = NULL;
92  }
93  }
94  void unicodeTest(const char *aBuf, int aLen)
95  {
96  if (mStart)
97  {
98  mStart = false;
99  if (aLen > 3)
100  switch (aBuf[0])
101  {
102  case '\xEF':
103  if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
104  // EF BB BF UTF-8 encoded BOM
105  proberState = KEncodingProber::FoundIt;
106  break;
107  case '\xFE':
108  if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
109  // FE FF 00 00 UCS-4, unusual octet order BOM (3412)
110  proberState = KEncodingProber::FoundIt;
111  else if ('\xFF' == aBuf[1])
112  // FE FF UTF-16, big endian BOM
113  proberState = KEncodingProber::FoundIt;
114  break;
115  case '\x00':
116  if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
117  // 00 00 FE FF UTF-32, big-endian BOM
118  proberState = KEncodingProber::FoundIt;
119  else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
120  // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
121  proberState = KEncodingProber::FoundIt;
122  break;
123  case '\xFF':
124  if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
125  // FF FE 00 00 UTF-32, little-endian BOM
126  proberState = KEncodingProber::FoundIt;
127  else if ('\xFE' == aBuf[1])
128  // FF FE UTF-16, little endian BOM
129  proberState = KEncodingProber::FoundIt;
130  break;
131  } // switch
132 
133  }
134  }
135  KEncodingProber::ProberType proberType;
136  KEncodingProber::ProberState proberState;
137  kencodingprober::nsCharSetProber *prober;
138  bool mStart;
139 };
140 
141 KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType): d(new KEncodingProberPrivate())
142 {
143  setProberType(proberType);
144 }
145 
146 KEncodingProber::~KEncodingProber()
147 {
148  delete d;
149 }
150 
151 void KEncodingProber::reset()
152 {
153  d->proberState = KEncodingProber::Probing;
154  d->mStart = true;
155 }
156 
157 KEncodingProber::ProberState KEncodingProber::feed(const QByteArray &data)
158 {
159  return feed(data.data(), data.size());
160 }
161 
162 KEncodingProber::ProberState KEncodingProber::feed(const char* data, int len)
163 {
164  if (!d->prober)
165  return d->proberState;
166  if (d->proberState == Probing) {
167  if (d->mStart) {
168  d->unicodeTest(data, len);
169  if (d->proberState == FoundIt)
170  return d->proberState;
171  }
172  d->prober->HandleData(data, len);
173  switch (d->prober->GetState())
174  {
175  case kencodingprober::eNotMe:
176  d->proberState = NotMe;
177  break;
178  case kencodingprober::eFoundIt:
179  d->proberState = FoundIt;
180  break;
181  default:
182  d->proberState = Probing;
183  break;
184  }
185  }
186 #ifdef DEBUG_PROBE
187  d->prober->DumpStatus();
188 #endif
189  return d->proberState;
190 }
191 
192 KEncodingProber::ProberState KEncodingProber::state() const
193 {
194  return d->proberState;
195 }
196 
197 //DEPRECATED, do *not* use
198 #ifndef KDE_NO_DEPRECATED
199 const char* KEncodingProber::encodingName() const
200 {
201  return qstrdup(encoding().constData());
202 }
203 #endif
204 
205 QByteArray KEncodingProber::encoding() const
206 {
207  if (!d->prober)
208  return QByteArray("UTF-8");
209 
210  return QByteArray(d->prober->GetCharSetName());
211 }
212 
213 float KEncodingProber::confidence() const
214 {
215  if (!d->prober)
216  return 0.0;
217 
218  return d->prober->GetConfidence();
219 }
220 
221 KEncodingProber::ProberType KEncodingProber::proberType() const
222 {
223  return d->proberType;
224 }
225 
226 void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
227 {
228  d->setProberType(proberType);
229  reset();
230 }
231 
232 KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString& lang)
233 {
234  if (lang.isEmpty())
235  return KEncodingProber::Universal;
236  else if (lang==i18nc("@item Text character set", "Disabled"))
237  return KEncodingProber::None;
238  else if (lang==i18nc("@item Text character set", "Universal"))
239  return KEncodingProber::Universal;
240  else if (lang==i18nc("@item Text character set", "Unicode"))
241  return KEncodingProber::Unicode;
242  else if (lang==i18nc("@item Text character set", "Cyrillic"))
243  return KEncodingProber::Cyrillic;
244  else if (lang==i18nc("@item Text character set", "Western European"))
245  return KEncodingProber::WesternEuropean;
246  else if (lang==i18nc("@item Text character set", "Central European"))
247  return KEncodingProber::CentralEuropean;
248  else if (lang==i18nc("@item Text character set", "Greek"))
249  return KEncodingProber::Greek;
250  else if (lang==i18nc("@item Text character set", "Hebrew"))
251  return KEncodingProber::Hebrew;
252  else if (lang==i18nc("@item Text character set", "Turkish"))
253  return KEncodingProber::Turkish;
254  else if (lang==i18nc("@item Text character set", "Japanese"))
255  return KEncodingProber::Japanese;
256  else if (lang==i18nc("@item Text character set", "Baltic"))
257  return KEncodingProber::Baltic;
258  else if (lang==i18nc("@item Text character set", "Chinese Traditional"))
259  return KEncodingProber::ChineseTraditional;
260  else if (lang==i18nc("@item Text character set", "Chinese Simplified"))
261  return KEncodingProber::ChineseSimplified;
262  else if (lang==i18nc("@item Text character set", "Arabic"))
263  return KEncodingProber::Arabic;
264 
265  return KEncodingProber::Universal;
266 }
267 
268 QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
269 {
270  switch (proberType)
271  {
272  case KEncodingProber::None:
273  return i18nc("@item Text character set", "Disabled");
274  break;
275  case KEncodingProber::Universal:
276  return i18nc("@item Text character set", "Universal");
277  break;
278  case KEncodingProber::Arabic:
279  return i18nc("@item Text character set", "Arabic");
280  break;
281  case KEncodingProber::Baltic:
282  return i18nc("@item Text character set", "Baltic");
283  break;
284  case KEncodingProber::CentralEuropean:
285  return i18nc("@item Text character set", "Central European");
286  break;
287  case KEncodingProber::Cyrillic:
288  return i18nc("@item Text character set", "Cyrillic");
289  break;
290  case KEncodingProber::Greek:
291  return i18nc("@item Text character set", "Greek");
292  break;
293  case KEncodingProber::Hebrew:
294  return i18nc("@item Text character set", "Hebrew");
295  break;
296  case KEncodingProber::Japanese:
297  return i18nc("@item Text character set", "Japanese");
298  break;
299  case KEncodingProber::Turkish:
300  return i18nc("@item Text character set", "Turkish");
301  break;
302  case KEncodingProber::WesternEuropean:
303  return i18nc("@item Text character set", "Western European");
304  break;
305  case KEncodingProber::ChineseTraditional:
306  return i18nc("@item Text character set", "Chinese Traditional");
307  break;
308  case KEncodingProber::ChineseSimplified:
309  return i18nc("@item Text character set", "Chinese Simplified");
310  break;
311  case KEncodingProber::Korean:
312  return i18nc("@item Text character set", "Korean");
313  break;
314  case KEncodingProber::Thai:
315  return i18nc("@item Text character set", "Thai");
316  break;
317  case KEncodingProber::Unicode:
318  return i18nc("@item Text character set", "Unicode");
319  break;
320  default:
321  return QString();
322  }
323 }
KEncodingProber::state
ProberState state() const
Definition: kencodingprober.cpp:192
KEncodingProber::Korean
Definition: kencodingprober.h:89
KEncodingProber::Cyrillic
Definition: kencodingprober.h:85
KEncodingProber::Baltic
Definition: kencodingprober.h:81
KEncodingProber::ChineseTraditional
Definition: kencodingprober.h:84
KEncodingProber::Unicode
Definition: kencodingprober.h:95
KEncodingProber::proberType
ProberType proberType() const
Definition: kencodingprober.cpp:221
ChineseGroupProber.h
KEncodingProber::~KEncodingProber
~KEncodingProber()
Definition: kencodingprober.cpp:146
nsUniversalDetector.h
KEncodingProber::CentralEuropean
Definition: kencodingprober.h:82
nsCharSetProber.h
KEncodingProber::NotMe
Definitely not one of the encodings supported by the current ProberType.
Definition: kencodingprober.h:73
KEncodingProber::feed
ProberState feed(const QByteArray &data)
The main class method.
Definition: kencodingprober.cpp:157
KEncodingProber::WesternEuropean
Definition: kencodingprober.h:96
QString
klocale.h
KEncodingProber::setProberType
void setProberType(ProberType proberType)
change current prober's ProberType and reset the prober
Definition: kencodingprober.cpp:226
KEncodingProber::Greek
Definition: kencodingprober.h:86
i18nc
QString i18nc(const char *ctxt, const char *text)
Returns a localized version of a string and a context.
Definition: klocalizedstring.h:797
KEncodingProber::ChineseSimplified
Definition: kencodingprober.h:83
kencodingprober::ChineseGroupProber
Definition: ChineseGroupProber.h:34
KEncodingProber::Hebrew
Definition: kencodingprober.h:87
kencodingprober::eFoundIt
Definition: nsCharSetProber.h:36
kencodingprober::UnicodeGroupProber
Definition: UnicodeGroupProber.h:34
KEncodingProber::Japanese
Definition: kencodingprober.h:88
kencodingprober.h
nsMBCSGroupProber.h
kencodingprober::nsCharSetProber
Definition: nsCharSetProber.h:42
KEncodingProber::KEncodingProber
KEncodingProber(ProberType proberType=Universal)
Default ProberType is Universal (detects all possible encodings)
Definition: kencodingprober.cpp:141
KEncodingProber::confidence
float confidence() const
Definition: kencodingprober.cpp:213
KEncodingProber::Other
Definition: kencodingprober.h:91
kencodingprober::JapaneseGroupProber
Definition: JapaneseGroupProber.h:36
KEncodingProber::proberTypeForName
static ProberType proberTypeForName(const QString &lang)
Definition: kencodingprober.cpp:232
KEncodingProber::encoding
QByteArray encoding() const
Definition: kencodingprober.cpp:205
KEncodingProber::nameForProberType
static QString nameForProberType(ProberType proberType)
map ProberType to language string
Definition: kencodingprober.cpp:268
nsSBCSGroupProber.h
kencodingprober::eNotMe
Definition: nsCharSetProber.h:37
KEncodingProber::reset
void reset()
reset the prober's internal state and data.
Definition: kencodingprober.cpp:151
UnicodeGroupProber.h
KEncodingProber::Thai
Definition: kencodingprober.h:93
kencodingprober::nsUniversalDetector
Definition: nsUniversalDetector.h:40
KEncodingProber::Probing
Need more data to make a decision.
Definition: kencodingprober.h:74
KEncodingProber::Turkish
Definition: kencodingprober.h:94
KEncodingProber::ProberType
ProberType
Definition: kencodingprober.h:77
KEncodingProber::FoundIt
The encoding has been found with certainty.
Definition: kencodingprober.h:72
KEncodingProber::ProberState
ProberState
Definition: kencodingprober.h:71
kencodingprober::nsMBCSGroupProber
Definition: nsMBCSGroupProber.h:39
kencodingprober::nsSBCSGroupProber
Definition: nsSBCSGroupProber.h:47
KEncodingProber::None
Definition: kencodingprober.h:78
JapaneseGroupProber.h
KEncodingProber::SouthEasternEurope
Definition: kencodingprober.h:92
KEncodingProber::NorthernSaami
Definition: kencodingprober.h:90
KEncodingProber::Arabic
Definition: kencodingprober.h:80
KEncodingProber::encodingName
const char * encodingName() const
Definition: kencodingprober.cpp:199
KEncodingProber::Universal
Definition: kencodingprober.h:79
This file is part of the KDE documentation.
Documentation copyright © 1996-2014 The KDE developers.
Generated on Tue Oct 14 2014 22:47:08 by doxygen 1.8.7 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

KDECore

Skip menu "KDECore"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Modules
  • Related Pages

kdelibs API Reference

Skip menu "kdelibs API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  • kjsembed
  •   WTF
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Nepomuk-Core
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver

Search



Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal