• Skip to content
  • Skip to link menu
KDE API Reference
  • KDE API Reference
  • kdelibs API Reference
  • KDE Home
  • Contact Us
 

KDECore

  • sources
  • kde-4.12
  • kdelibs
  • kdecore
  • localization
  • probers
nsUniversalDetector.cpp
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* -*- C++ -*-
3 * Copyright (C) 1998 <developer@mozilla.org>
4 * Copyright (C) 2008 <wkai@gmail.com>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25 
26 #include "nsUniversalDetector.h"
27 
28 #include "nsMBCSGroupProber.h"
29 #include "nsSBCSGroupProber.h"
30 #include "nsEscCharsetProber.h"
31 #include "nsLatin1Prober.h"
32 
33 namespace kencodingprober {
34 nsUniversalDetector::nsUniversalDetector()
35 {
36  mDone = false;
37  mBestGuess = -1; //illegal value as signal
38  mInTag = false;
39  mEscCharSetProber = 0;
40 
41  mStart = true;
42  mDetectedCharset = 0;
43  mGotData = false;
44  mInputState = ePureAscii;
45  mLastChar = '\0';
46 
47  unsigned int i;
48  for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
49  mCharSetProbers[i] = 0;
50 }
51 
52 nsUniversalDetector::~nsUniversalDetector()
53 {
54  for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
55  delete mCharSetProbers[i];
56  delete mEscCharSetProber;
57 }
58 
59 void
60 nsUniversalDetector::Reset()
61 {
62  mDone = false;
63  mBestGuess = -1; //illegal value as signal
64  mInTag = false;
65 
66  mStart = true;
67  mDetectedCharset = 0;
68  mGotData = false;
69  mInputState = ePureAscii;
70  mLastChar = '\0';
71 
72  if (mEscCharSetProber)
73  mEscCharSetProber->Reset();
74 
75  unsigned int i;
76  for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
77  if (mCharSetProbers[i])
78  mCharSetProbers[i]->Reset();
79 }
80 
81 //---------------------------------------------------------------------
// Confidence thresholds, as float values.
// SHORTCUT_THRESHOLD is not referenced by the code visible in this file.
#define SHORTCUT_THRESHOLD (static_cast<float>(0.95))
// A prober's guess is only reported when its confidence exceeds this value;
// it is also the confidence returned when no better answer is available.
#define MINIMUM_THRESHOLD (static_cast<float>(0.20))
84 
85 nsProbingState nsUniversalDetector::HandleData(const char* aBuf, unsigned int aLen)
86 {
87  if(mDone)
88  return eFoundIt;
89 
90  if (aLen > 0)
91  mGotData = true;
92 
93  unsigned int i;
94  for (i = 0; i < aLen; i++)
95  {
96  //other than 0xa0, if every othe character is ascii, the page is ascii
97  if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP
98  {
99  //we got a non-ascii byte (high-byte)
100  if (mInputState != eHighbyte)
101  {
102  //adjust state
103  mInputState = eHighbyte;
104 
105  //kill mEscCharSetProber if it is active
106  delete mEscCharSetProber;
107  mEscCharSetProber = 0;
108 
109  //start multibyte and singlebyte charset prober
110  if (0 == mCharSetProbers[0])
111  mCharSetProbers[0] = new nsMBCSGroupProber;
112  if (0 == mCharSetProbers[1])
113  mCharSetProbers[1] = new nsSBCSGroupProber;
114  if (0 == mCharSetProbers[2])
115  mCharSetProbers[2] = new nsLatin1Prober;
116  }
117  }
118  else
119  {
120  //ok, just pure ascii so far
121  if ( ePureAscii == mInputState &&
122  (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
123  {
124  //found escape character or HZ "~{"
125  mInputState = eEscAscii;
126  }
127 
128  mLastChar = aBuf[i];
129  }
130  }
131 
132  nsProbingState st = eDetecting;
133  switch (mInputState)
134  {
135  case eEscAscii:
136  if (0 == mEscCharSetProber) {
137  mEscCharSetProber = new nsEscCharSetProber;
138  }
139  st = mEscCharSetProber->HandleData(aBuf, aLen);
140  if (st == eFoundIt)
141  {
142  mDone = true;
143  mDetectedCharset = mEscCharSetProber->GetCharSetName();
144  }
145  break;
146  case eHighbyte:
147  for (i = 0; i < NUM_OF_CHARSET_PROBERS; ++i)
148  {
149  st = mCharSetProbers[i]->HandleData(aBuf, aLen);
150  if (st == eFoundIt)
151  {
152  mDone = true;
153  mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
154  }
155  }
156  break;
157 
158  default: //pure ascii
159  mDetectedCharset = "UTF-8";
160  }
161  return st;
162 }
163 
164 
165 //---------------------------------------------------------------------
166 const char* nsUniversalDetector::GetCharSetName()
167 {
168  if (mDetectedCharset)
169  return mDetectedCharset;
170  switch (mInputState)
171  {
172  case eHighbyte:
173  {
174  float proberConfidence;
175  float maxProberConfidence = (float)0.0;
176  int maxProber = 0;
177 
178  for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
179  {
180  proberConfidence = mCharSetProbers[i]->GetConfidence();
181  if (proberConfidence > maxProberConfidence)
182  {
183  maxProberConfidence = proberConfidence;
184  maxProber = i;
185  }
186  }
187  //do not report anything because we are not confident of it, that's in fact a negative answer
188  if (maxProberConfidence > MINIMUM_THRESHOLD)
189  return mCharSetProbers[maxProber]->GetCharSetName();
190  }
191  case eEscAscii:
192  break;
193  default: // pure ascii
194  ;
195  }
196  return "UTF-8";
197 
198 }
199 
200 //---------------------------------------------------------------------
201 float nsUniversalDetector::GetConfidence()
202 {
203  if (!mGotData)
204  {
205  // we haven't got any data yet, return immediately
206  // caller program sometimes call DataEnd before anything has been sent to detector
207  return MINIMUM_THRESHOLD;
208  }
209  if (mDetectedCharset)
210  return 0.99f;
211  switch (mInputState)
212  {
213  case eHighbyte:
214  {
215  float proberConfidence;
216  float maxProberConfidence = (float)0.0;
217  int maxProber = 0;
218 
219  for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
220  {
221  proberConfidence = mCharSetProbers[i]->GetConfidence();
222  if (proberConfidence > maxProberConfidence)
223  {
224  maxProberConfidence = proberConfidence;
225  maxProber = i;
226  }
227  }
228  //do not report anything because we are not confident of it, that's in fact a negative answer
229  if (maxProberConfidence > MINIMUM_THRESHOLD)
230  return mCharSetProbers[maxProber]->GetConfidence();
231  }
232  case eEscAscii:
233  break;
234  default: // pure ascii
235  ;
236  }
237  return MINIMUM_THRESHOLD;
238 }
239 
240 nsProbingState nsUniversalDetector::GetState()
241 {
242  if (mDone)
243  return eFoundIt;
244  else
245  return eDetecting;
246 }
247 }
248 
249 
kencodingprober::nsCharSetProber::GetConfidence
virtual float GetConfidence(void)=0
kencodingprober::nsLatin1Prober
Definition: nsLatin1Prober.h:33
nsEscCharsetProber.h
kencodingprober::nsUniversalDetector::GetConfidence
float GetConfidence(void)
Definition: nsUniversalDetector.cpp:201
kencodingprober::nsUniversalDetector::GetState
nsProbingState GetState()
Definition: nsUniversalDetector.cpp:240
nsUniversalDetector.h
kencodingprober::nsUniversalDetector::mCharSetProbers
nsCharSetProber * mCharSetProbers[NUM_OF_CHARSET_PROBERS]
Definition: nsUniversalDetector.h:61
kencodingprober::nsUniversalDetector::mGotData
bool mGotData
Definition: nsUniversalDetector.h:56
kencodingprober::nsUniversalDetector::mStart
bool mStart
Definition: nsUniversalDetector.h:55
kencodingprober::nsCharSetProber::Reset
virtual void Reset(void)=0
kencodingprober::nsUniversalDetector::Reset
void Reset(void)
Definition: nsUniversalDetector.cpp:60
kencodingprober::nsUniversalDetector::GetCharSetName
const char * GetCharSetName()
Definition: nsUniversalDetector.cpp:166
kencodingprober::nsCharSetProber::HandleData
virtual nsProbingState HandleData(const char *aBuf, unsigned int aLen)=0
kencodingprober::nsUniversalDetector::mEscCharSetProber
nsCharSetProber * mEscCharSetProber
Definition: nsUniversalDetector.h:62
kencodingprober::eFoundIt
Definition: nsCharSetProber.h:36
NUM_OF_CHARSET_PROBERS
#define NUM_OF_CHARSET_PROBERS
Definition: nsUniversalDetector.h:31
kencodingprober::nsUniversalDetector::mDetectedCharset
const char * mDetectedCharset
Definition: nsUniversalDetector.h:58
kencodingprober::nsUniversalDetector::mBestGuess
int mBestGuess
Definition: nsUniversalDetector.h:59
kencodingprober::nsUniversalDetector::mInputState
nsInputState mInputState
Definition: nsUniversalDetector.h:49
MINIMUM_THRESHOLD
#define MINIMUM_THRESHOLD
Definition: nsUniversalDetector.cpp:83
kencodingprober::ePureAscii
Definition: nsUniversalDetector.h:35
nsMBCSGroupProber.h
kencodingprober::nsUniversalDetector::mLastChar
char mLastChar
Definition: nsUniversalDetector.h:57
kencodingprober::eHighbyte
Definition: nsUniversalDetector.h:37
kencodingprober::nsProbingState
nsProbingState
Definition: nsCharSetProber.h:34
nsSBCSGroupProber.h
nsLatin1Prober.h
kencodingprober::nsCharSetProber::GetCharSetName
virtual const char * GetCharSetName()=0
kencodingprober::eDetecting
Definition: nsCharSetProber.h:35
kencodingprober::nsUniversalDetector::~nsUniversalDetector
virtual ~nsUniversalDetector()
Definition: nsUniversalDetector.cpp:52
kencodingprober::eEscAscii
Definition: nsUniversalDetector.h:36
kencodingprober::nsEscCharSetProber
Definition: nsEscCharsetProber.h:34
kencodingprober::nsMBCSGroupProber
Definition: nsMBCSGroupProber.h:39
kencodingprober::nsSBCSGroupProber
Definition: nsSBCSGroupProber.h:47
kencodingprober::nsUniversalDetector::mInTag
bool mInTag
Definition: nsUniversalDetector.h:54
kencodingprober::nsUniversalDetector::mDone
bool mDone
Definition: nsUniversalDetector.h:53
kencodingprober::nsUniversalDetector::nsUniversalDetector
nsUniversalDetector()
Definition: nsUniversalDetector.cpp:34
kencodingprober::nsUniversalDetector::HandleData
nsProbingState HandleData(const char *aBuf, unsigned int aLen)
Definition: nsUniversalDetector.cpp:85
This file is part of the KDE documentation.
Documentation copyright © 1996-2014 The KDE developers.
Generated on Tue Oct 14 2014 22:47:09 by doxygen 1.8.7 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

KDECore

Skip menu "KDECore"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Modules
  • Related Pages

kdelibs API Reference

Skip menu "kdelibs API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  • kjsembed
  •   WTF
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Nepomuk-Core
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver

Search



Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal