• Skip to content
  • Skip to link menu
KDE 4.3 API Reference
  • KDE API Reference
  • kdelibs
  • Sitemap
  • Contact Us
 

KDECore

CharDistribution.h

Go to the documentation of this file.
00001 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
00002 /*  -*- C++ -*-
00003  *  Copyright (C) 1998 <developer@mozilla.org>
00004  *
00005  *
00006  *  Permission is hereby granted, free of charge, to any person obtaining
00007  *  a copy of this software and associated documentation files (the
00008  *  "Software"), to deal in the Software without restriction, including
00009  *  without limitation the rights to use, copy, modify, merge, publish,
00010  *  distribute, sublicense, and/or sell copies of the Software, and to
00011  *  permit persons to whom the Software is furnished to do so, subject to
00012  *  the following conditions:
00013  *
00014  *  The above copyright notice and this permission notice shall be included 
00015  *  in all copies or substantial portions of the Software.
00016  *
00017  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
00018  *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
00019  *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
00020  *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
00021  *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
00022  *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
00023  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
00024  */
00025 
00026 #ifndef CharDistribution_h__
00027 #define CharDistribution_h__
00028 
00029 #include "kdemacros.h"
00030 
00031 #define ENOUGH_DATA_THRESHOLD 256
00032  
00033 namespace kencodingprober {
00034 class KDE_NO_EXPORT  CharDistributionAnalysis
00035 {
00036 public:
00037   CharDistributionAnalysis() {Reset();};
00038   virtual ~CharDistributionAnalysis() {};
00039 
00040   //feed a block of data and do distribution analysis
00041   void HandleData(const char* /* aBuf */, unsigned int /* aLen */) {};
00042   
00043   //Feed a character with known length
00044   void HandleOneChar(const char* aStr, unsigned int aCharLen)
00045   {
00046     int order;
00047 
00048     //we only care about 2-bytes character in our distribution analysis
00049     order = (aCharLen == 2) ? GetOrder(aStr) : -1;
00050 
00051     if (order >= 0)
00052     {
00053       mTotalChars++;
00054       //order is valid
00055       if ((unsigned int)order < mTableSize)
00056       {
00057         if (512 > mCharToFreqOrder[order])
00058           mFreqChars++;
00059       }
00060     }
00061   };
00062 
00063   //return confidence base on existing data
00064   float GetConfidence();
00065 
00066   //Reset analyser, clear any state 
00067   void      Reset(void) 
00068   {
00069     mDone = false;
00070     mTotalChars = 0;
00071     mFreqChars = 0;
00072   };
00073 
00074   //This function is for future extension. Caller can use this function to control
00075   //analyser's behavior
00076   void      SetOpion(){};
00077 
00078   //It is not necessary to receive all data to draw conclusion. For charset detection,
00079   // certain amount of data is enough
00080   bool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;};
00081 
00082 protected:
00083   //we do not handle character base on its original encoding string, but 
00084   //convert this encoding string to a number, here called order.
00085   //This allow multiple encoding of a language to share one frequency table 
00086   virtual int GetOrder(const char* /* str */) {return -1;};
00087   
00088   //If this flag is set to true, detection is done and conclusion has been made
00089   bool   mDone;
00090 
00091   //The number of characters whose frequency order is less than 512
00092   unsigned int mFreqChars;
00093 
00094   //Total character encounted.
00095   unsigned int mTotalChars;
00096 
00097   //Mapping table to get frequency order from char order (get from GetOrder())
00098   const short  *mCharToFreqOrder;
00099 
00100   //Size of above table
00101   unsigned int mTableSize;
00102 
00103   //This is a constant value varies from language to language, it is used in 
00104   //calculating confidence. See my paper for further detail.
00105   float    mTypicalDistributionRatio;
00106 };
00107 
00108 
00109 class KDE_NO_EXPORT  EUCTWDistributionAnalysis: public CharDistributionAnalysis
00110 {
00111 public:
00112   EUCTWDistributionAnalysis();
00113 protected:
00114 
00115   //for euc-TW encoding, we are interested 
00116   //  first  byte range: 0xc4 -- 0xfe
00117   //  second byte range: 0xa1 -- 0xfe
00118   //no validation needed here. State machine has done that
00119   int GetOrder(const char* str) 
00120   { if ((unsigned char)*str >= (unsigned char)0xc4)  
00121       return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
00122     else
00123       return -1;
00124   };
00125 };
00126 
00127 
00128 class KDE_NO_EXPORT  EUCKRDistributionAnalysis : public CharDistributionAnalysis
00129 {
00130 public:
00131   EUCKRDistributionAnalysis();
00132 protected:
00133   //for euc-KR encoding, we are interested 
00134   //  first  byte range: 0xb0 -- 0xfe
00135   //  second byte range: 0xa1 -- 0xfe
00136   //no validation needed here. State machine has done that
00137   int GetOrder(const char* str) 
00138   { if ((unsigned char)*str >= (unsigned char)0xb0)  
00139       return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
00140     else
00141       return -1;
00142   };
00143 };
00144 
00145 class KDE_NO_EXPORT  GB2312DistributionAnalysis : public CharDistributionAnalysis
00146 {
00147 public:
00148   GB2312DistributionAnalysis();
00149 protected:
00150   //for GB2312 encoding, we are interested 
00151   //  first  byte range: 0xb0 -- 0xfe
00152   //  second byte range: 0xa1 -- 0xfe
00153   //no validation needed here. State machine has done that
00154   int GetOrder(const char* str) 
00155   { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)  
00156       return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
00157     else
00158       return -1;
00159   };
00160 };
00161 
00162 
00163 class KDE_NO_EXPORT  Big5DistributionAnalysis : public CharDistributionAnalysis
00164 {
00165 public:
00166   Big5DistributionAnalysis();
00167 protected:
00168   //for big5 encoding, we are interested 
00169   //  first  byte range: 0xa4 -- 0xfe
00170   //  second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
00171   //no validation needed here. State machine has done that
00172   int GetOrder(const char* str) 
00173   { if ((unsigned char)*str >= (unsigned char)0xa4)  
00174       if ((unsigned char)str[1] >= (unsigned char)0xa1)
00175         return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;
00176       else
00177         return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
00178     else
00179       return -1;
00180   };
00181 };
00182 
00183 class KDE_NO_EXPORT  SJISDistributionAnalysis : public CharDistributionAnalysis
00184 {
00185 public:
00186   SJISDistributionAnalysis();
00187 protected:
00188   //for sjis encoding, we are interested 
00189   //  first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
00190   //  second byte range: 0x40 -- 0x7e,  0x81 -- oxfe
00191   //no validation needed here. State machine has done that
00192   int GetOrder(const char* str) 
00193   { 
00194     int order;
00195     if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)  
00196       order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
00197     else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)  
00198       order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31);
00199     else
00200       return -1;
00201     order += (unsigned char)*(str+1) - 0x40;
00202     if ((unsigned char)str[1] > (unsigned char)0x7f)
00203       order--;
00204     return order;
00205   };
00206 };
00207 
00208 class KDE_NO_EXPORT  EUCJPDistributionAnalysis : public CharDistributionAnalysis
00209 {
00210 public:
00211   EUCJPDistributionAnalysis();
00212 protected:
00213   //for euc-JP encoding, we are interested 
00214   //  first  byte range: 0xa0 -- 0xfe
00215   //  second byte range: 0xa1 -- 0xfe
00216   //no validation needed here. State machine has done that
00217   int GetOrder(const char* str) 
00218   { if ((unsigned char)*str >= (unsigned char)0xa0)  
00219       return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
00220     else
00221       return -1;
00222   };
00223 };
00224 }
00225 #endif //CharDistribution_h__
00226 

KDECore

Skip menu "KDECore"
  • Main Page
  • Modules
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

kdelibs

Skip menu "kdelibs"
  • DNSSD
  • Interfaces
  •   KHexEdit
  •   KMediaPlayer
  •   KSpeech
  •   KTextEditor
  • Kate
  • kconf_update
  • KDE3Support
  •   KUnitTest
  • KDECore
  • KDED
  • KDEsu
  • KDEUI
  • KDocTools
  • KFile
  • KHTML
  • KImgIO
  • KInit
  • kio
  • KIOSlave
  • KJS
  •   KJS-API
  •   WTF
  • kjsembed
  • KNewStuff
  • KParts
  • KPty
  • Kross
  • KUtils
  • Nepomuk
  • Plasma
  • Solid
  • Sonnet
  • ThreadWeaver
Generated for kdelibs by doxygen 1.6.1
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal