• Skip to content
  • Skip to link menu
KDE API Reference
  • KDE API Reference
  • kdelibs API Reference
  • KDE Home
  • Contact Us
 

KDECore

  • sources
  • kde-4.12
  • kdelibs
  • kdecore
  • localization
  • probers
CharDistribution.h
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* -*- C++ -*-
3  * Copyright (C) 1998 <developer@mozilla.org>
4  *
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining
7  * a copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sublicense, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24  */
25 
26 #ifndef CharDistribution_h__
27 #define CharDistribution_h__
28 
29 #include "kdemacros.h"
30 
31 #define ENOUGH_DATA_THRESHOLD 256
32 
33 namespace kencodingprober {
34 class KDE_NO_EXPORT CharDistributionAnalysis
35 {
36 public:
37  CharDistributionAnalysis() {Reset();};
38  virtual ~CharDistributionAnalysis() {};
39 
40  //feed a block of data and do distribution analysis
41  void HandleData(const char* /* aBuf */, unsigned int /* aLen */) {};
42 
43  //Feed a character with known length
44  void HandleOneChar(const char* aStr, unsigned int aCharLen)
45  {
46  int order;
47 
48  //we only care about 2-bytes character in our distribution analysis
49  order = (aCharLen == 2) ? GetOrder(aStr) : -1;
50 
51  if (order >= 0)
52  {
53  mTotalChars++;
54  //order is valid
55  if ((unsigned int)order < mTableSize)
56  {
57  if (512 > mCharToFreqOrder[order])
58  mFreqChars++;
59  }
60  }
61  };
62 
63  //return confidence base on existing data
64  float GetConfidence();
65 
66  //Reset analyser, clear any state
67  void Reset(void)
68  {
69  mDone = false;
70  mTotalChars = 0;
71  mFreqChars = 0;
72  };
73 
74  //This function is for future extension. Caller can use this function to control
75  //analyser's behavior
76  void SetOpion(){};
77 
78  //It is not necessary to receive all data to draw conclusion. For charset detection,
79  // certain amount of data is enough
80  bool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;};
81 
82 protected:
83  //we do not handle character base on its original encoding string, but
84  //convert this encoding string to a number, here called order.
85  //This allow multiple encoding of a language to share one frequency table
86  virtual int GetOrder(const char* /* str */) {return -1;};
87 
88  //If this flag is set to true, detection is done and conclusion has been made
89  bool mDone;
90 
91  //The number of characters whose frequency order is less than 512
92  unsigned int mFreqChars;
93 
94  //Total character encounted.
95  unsigned int mTotalChars;
96 
97  //Mapping table to get frequency order from char order (get from GetOrder())
98  const short *mCharToFreqOrder;
99 
100  //Size of above table
101  unsigned int mTableSize;
102 
103  //This is a constant value varies from language to language, it is used in
104  //calculating confidence. See my paper for further detail.
105  float mTypicalDistributionRatio;
106 };
107 
108 
109 class KDE_NO_EXPORT EUCTWDistributionAnalysis: public CharDistributionAnalysis
110 {
111 public:
112  EUCTWDistributionAnalysis();
113 protected:
114 
115  //for euc-TW encoding, we are interested
116  // first byte range: 0xc4 -- 0xfe
117  // second byte range: 0xa1 -- 0xfe
118  //no validation needed here. State machine has done that
119  int GetOrder(const char* str)
120  { if ((unsigned char)*str >= (unsigned char)0xc4)
121  return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
122  else
123  return -1;
124  };
125 };
126 
127 
128 class KDE_NO_EXPORT EUCKRDistributionAnalysis : public CharDistributionAnalysis
129 {
130 public:
131  EUCKRDistributionAnalysis();
132 protected:
133  //for euc-KR encoding, we are interested
134  // first byte range: 0xb0 -- 0xfe
135  // second byte range: 0xa1 -- 0xfe
136  //no validation needed here. State machine has done that
137  int GetOrder(const char* str)
138  { if ((unsigned char)*str >= (unsigned char)0xb0)
139  return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
140  else
141  return -1;
142  };
143 };
144 
145 class KDE_NO_EXPORT GB2312DistributionAnalysis : public CharDistributionAnalysis
146 {
147 public:
148  GB2312DistributionAnalysis();
149 protected:
150  //for GB2312 encoding, we are interested
151  // first byte range: 0xb0 -- 0xfe
152  // second byte range: 0xa1 -- 0xfe
153  //no validation needed here. State machine has done that
154  int GetOrder(const char* str)
155  { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)
156  return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
157  else
158  return -1;
159  };
160 };
161 
162 
163 class KDE_NO_EXPORT Big5DistributionAnalysis : public CharDistributionAnalysis
164 {
165 public:
166  Big5DistributionAnalysis();
167 protected:
168  //for big5 encoding, we are interested
169  // first byte range: 0xa4 -- 0xfe
170  // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
171  //no validation needed here. State machine has done that
172  int GetOrder(const char* str)
173  { if ((unsigned char)*str >= (unsigned char)0xa4)
174  if ((unsigned char)str[1] >= (unsigned char)0xa1)
175  return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;
176  else
177  return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
178  else
179  return -1;
180  };
181 };
182 
183 class KDE_NO_EXPORT SJISDistributionAnalysis : public CharDistributionAnalysis
184 {
185 public:
186  SJISDistributionAnalysis();
187 protected:
188  //for sjis encoding, we are interested
189  // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
190  // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
191  //no validation needed here. State machine has done that
192  int GetOrder(const char* str)
193  {
194  int order;
195  if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)
196  order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
197  else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)
198  order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31);
199  else
200  return -1;
201  order += (unsigned char)*(str+1) - 0x40;
202  if ((unsigned char)str[1] > (unsigned char)0x7f)
203  order--;
204  return order;
205  };
206 };
207 
208 class KDE_NO_EXPORT EUCJPDistributionAnalysis : public CharDistributionAnalysis
209 {
210 public:
211  EUCJPDistributionAnalysis();
212 protected:
213  //for euc-JP encoding, we are interested
214  // first byte range: 0xa0 -- 0xfe
215  // second byte range: 0xa1 -- 0xfe
216  //no validation needed here. State machine has done that
217  int GetOrder(const char* str)
218  { if ((unsigned char)*str >= (unsigned char)0xa0)
219  return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
220  else
221  return -1;
222  };
223 };
224 }
225 #endif //CharDistribution_h__
226 
kencodingprober::CharDistributionAnalysis::mTypicalDistributionRatio
float mTypicalDistributionRatio
Definition: CharDistribution.h:105
kencodingprober::CharDistributionAnalysis::mFreqChars
unsigned int mFreqChars
Definition: CharDistribution.h:92
kencodingprober::EUCKRDistributionAnalysis
Definition: CharDistribution.h:128
KDE_NO_EXPORT
#define KDE_NO_EXPORT
The KDE_NO_EXPORT macro marks the symbol of the given variable to be hidden.
Definition: kdemacros.h.cmake:73
ENOUGH_DATA_THRESHOLD
#define ENOUGH_DATA_THRESHOLD
Definition: CharDistribution.h:31
kencodingprober::EUCTWDistributionAnalysis::GetOrder
int GetOrder(const char *str)
Definition: CharDistribution.h:119
kencodingprober::CharDistributionAnalysis::mTableSize
unsigned int mTableSize
Definition: CharDistribution.h:101
kencodingprober::GB2312DistributionAnalysis
Definition: CharDistribution.h:145
kencodingprober::EUCJPDistributionAnalysis::GetOrder
int GetOrder(const char *str)
Definition: CharDistribution.h:217
kencodingprober::CharDistributionAnalysis::GotEnoughData
bool GotEnoughData()
Definition: CharDistribution.h:80
kencodingprober::Big5DistributionAnalysis::GetOrder
int GetOrder(const char *str)
Definition: CharDistribution.h:172
kencodingprober::GB2312DistributionAnalysis::GetOrder
int GetOrder(const char *str)
Definition: CharDistribution.h:154
kencodingprober::SJISDistributionAnalysis::GetOrder
int GetOrder(const char *str)
Definition: CharDistribution.h:192
kencodingprober::EUCTWDistributionAnalysis
Definition: CharDistribution.h:109
kencodingprober::Big5DistributionAnalysis
Definition: CharDistribution.h:163
kencodingprober::EUCJPDistributionAnalysis
Definition: CharDistribution.h:208
kencodingprober::EUCKRDistributionAnalysis::GetOrder
int GetOrder(const char *str)
Definition: CharDistribution.h:137
kencodingprober::CharDistributionAnalysis::SetOpion
void SetOpion()
Definition: CharDistribution.h:76
kencodingprober::SJISDistributionAnalysis
Definition: CharDistribution.h:183
kencodingprober::CharDistributionAnalysis::mTotalChars
unsigned int mTotalChars
Definition: CharDistribution.h:95
kencodingprober::CharDistributionAnalysis::HandleData
void HandleData(const char *, unsigned int)
Definition: CharDistribution.h:41
kencodingprober::CharDistributionAnalysis::CharDistributionAnalysis
CharDistributionAnalysis()
Definition: CharDistribution.h:37
kencodingprober::CharDistributionAnalysis
Definition: CharDistribution.h:34
kencodingprober::CharDistributionAnalysis::Reset
void Reset(void)
Definition: CharDistribution.h:67
kencodingprober::CharDistributionAnalysis::mCharToFreqOrder
const short * mCharToFreqOrder
Definition: CharDistribution.h:98
kencodingprober::CharDistributionAnalysis::~CharDistributionAnalysis
virtual ~CharDistributionAnalysis()
Definition: CharDistribution.h:38
kencodingprober::CharDistributionAnalysis::HandleOneChar
void HandleOneChar(const char *aStr, unsigned int aCharLen)
Definition: CharDistribution.h:44
This file is part of the KDE documentation.
Documentation copyright © 1996-2014 The KDE developers.
Generated on Tue Oct 14 2014 22:47:07 by doxygen 1.8.7 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

KDECore

Skip menu "KDECore"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Modules
  • Related Pages

kdelibs API Reference

Skip menu "kdelibs API Reference"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDEWebKit
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  • kjsembed
  •   WTF
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUnitConversion
  • KUtils
  • Nepomuk
  • Nepomuk-Core
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver

Search



Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal