• Skip to content
  • Skip to link menu
KDE API Reference
  • KDE API Reference
  • kdeedu API Reference
  • KDE Home
  • Contact Us
 

kiten/lib

  • sources
  • kde-4.14
  • kdeedu
  • kiten
  • lib
  • DictEdict
indexededictfile.cpp
Go to the documentation of this file.
1 /*****************************************************************************
2  * This file is part of Kiten, a KDE Japanese Reference Tool *
3  * Copyright (C) 2001 Jason Katz-Brown <jason@katzbrown.com> *
4  * Copyright (C) 2008 Joseph Kerian <jkerian@gmail.com> *
5  * *
6  * This library is free software; you can redistribute it and/or *
7  * modify it under the terms of the GNU Library General Public *
8  * License as published by the Free Software Foundation; either *
9  * version 2 of the License, or (at your option) any later version. *
10  * *
11  * This library is distributed in the hope that it will be useful, *
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *
14  * Library General Public License for more details. *
15  * *
16  * You should have received a copy of the GNU Library General Public License *
17  * along with this library; see the file COPYING.LIB. If not, write to *
18  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, *
19  * Boston, MA 02110-1301, USA. *
20  *****************************************************************************/
21 
22 #include "indexededictfile.h"
23 
24 #include <KApplication>
25 #include <KDebug>
26 #include <KGlobal>
27 #include <KProcess>
28 #include <KStandardDirs>
29 
30 #include <QFile>
31 #include <QFileInfo>
32 #include <QString>
33 #include <QTextCodec>
34 #include <QVector>
35 
36 #include <sys/mman.h>
37 
38 IndexedEdictFile::IndexedEdictFile()
39 : m_valid( false )
40 , m_dictPtr( static_cast<unsigned char*>( MAP_FAILED ) )
41 , m_indexPtr( static_cast<uint32_t*>( MAP_FAILED ) )
42 {
43 }
44 
45 IndexedEdictFile::~IndexedEdictFile()
46 {
47  if( m_valid )
48  {
49  munmap( static_cast<void*>( m_dictPtr ), m_dictFile.size() );
50  munmap( static_cast<void*>( m_indexPtr ), m_indexFile.size() );
51  m_dictFile.close();
52  m_indexFile.close();
53  }
54 }
55 
56 //Warning: This assumes that both files are CLOSED
57 bool IndexedEdictFile::buildIndex()
58 {
59  KProcess proc;
60  proc << KStandardDirs::findExe("kitengen") << m_dictFile.fileName() << m_indexFile.fileName();
61  proc.start();
62  proc.waitForStarted();
63 
64  do
65  {
66  KApplication::processEvents();
67  } while( proc.waitForFinished( 5000 ) ); //FIXME: This just cuts the index generator off after 5 sec
68 
69  //FIXME: Check for the result of this operation
70  return proc.exitStatus() == QProcess::NormalExit && proc.exitCode() == 0;
71 }
72 
73 //Warning: This assumes that both files have already been opened
74 bool IndexedEdictFile::checkIndex() const
75 {
76  //Verify the index file version and size
77  uint32_t dictionaryLength = static_cast<uint32_t>( m_dictFile.size() );
78  dictionaryLength++;
79  uint32_t indexVersionTest;
80 
81  if( 4 == m_indexFile.read( reinterpret_cast<char*>( &indexVersionTest ), 4 ) )
82  {
83  if( indexVersionTest == dictionaryLength + indexFileVersion )
84  {
85  return true;
86  }
87  }
88 
89  return false;
90 }
91 
100 int IndexedEdictFile::equalOrSubstring( const char *str1, const char *str2 ) const
101 {
102  for(unsigned i=0; ; ++i)
103  {
104  unsigned char c1 = static_cast<unsigned char>( str1[ i ] );
105  unsigned char c2 = static_cast<unsigned char>( str2[ i ] );
106 
107  if( c1 == '\0' )
108  {
109  return 0;
110  }
111 
112  if( ( i % 2 ) == 0 )
113  {
114  //on the highbyte (of kana)
115  if( c2 == 0xA5 ) //Make katakana and hiragana equivelent
116  {
117  c2 = 0xA4;
118  }
119  if( c1 == 0xA5 )
120  {
121  c1 = 0xA4;
122  }
123  }
124 
125  if( ( 'A' <= c1 ) && ( c1 <= 'Z' ) )
126  {
127  c1 |= 0x20; // 'fix' uppercase
128  }
129  if( ( 'A' <= c2 ) && ( c2 <= 'Z' ) )
130  {
131  c2 |= 0x20;
132  }
133 
134  if( c1 != c2 )
135  {
136  return (int)c2 - (int)c1;
137  }
138  }
139 
140  return 0; //silly compiler requirements
141 }
142 
147 uint32_t IndexedEdictFile::findFirstMatch( const QByteArray &query ) const
148 {
149  int low = 0;
150  int high = m_indexFile.size() / sizeof( uint32_t ) - 1;
151  int cur;
152  int comp = 0;
153 
154  do
155  {
156  cur = ( high + low ) / 2;
157  comp = equalOrSubstring( query, lookupDictLine( cur ) );
158  if( comp < 0 )
159  {
160  low = cur + 1;
161  }
162  if( comp > 0 )
163  {
164  high = cur - 1;
165  }
166  } while( high >= low && comp != 0 && ! ( high == 0 && low == 0 ) );
167 
168  if( comp != 0 )
169  {
170  return 0;
171  }
172 
173  while( cur - 1 && 0 == equalOrSubstring( query,lookupDictLine( cur ) ) )
174  {
175  --cur;
176  }
177 
178  return cur;
179 }
180 
181 QVector<QString> IndexedEdictFile::findMatches( const QString &query ) const
182 {
183  QVector<QString> results;
184  if( ! m_valid )
185  {
186  return results;
187  }
188 
189  QTextCodec *codec = QTextCodec::codecForName( "eucJP" );
190  if( ! codec )
191  {
192  return results;
193  }
194 
195  QByteArray searchString = codec->fromUnicode( query );
196  int indexSize = m_indexFile.size() / sizeof( uint32_t );
197  int dictSize = m_dictFile.size() / sizeof( unsigned char );
198 
199  int matchLocation = findFirstMatch( searchString );
200  QByteArray currentWord = lookupDictLine( ++matchLocation );
201  if( matchLocation == 0 )
202  {
203  return results;
204  }
205 
206  QVector<uint32_t> possibleHits;
207 
208  do
209  {
210  currentWord = lookupDictLine( ++matchLocation );
211  int i = 0;
212  while( lookupDictChar( m_indexPtr[ matchLocation - 1 ] + i - 2 ) != 0x0A )
213  {
214  --i;
215  }
216  possibleHits.push_back( m_indexPtr[ matchLocation - 1 ] + i - 1 );
217  } while( matchLocation < indexSize && 0 == equalOrSubstring( searchString, currentWord ) );
218 
219  if( possibleHits.size() <= 0 )
220  {
221  return results;
222  }
223 
224  qSort( possibleHits );
225  uint32_t last = 0;
226 
227  foreach( uint32_t it, possibleHits )
228  {
229  if(last != it)
230  {
231  last = it;
232  results.push_back( codec->toUnicode( lookupFullLine( it ) ) );
233  }
234  }
235 
236  return results;
237 }
238 
247 int IndexedEdictFile::findMatches( const char *str1, const char *str2 ) const
248 {
249 #define EUC_LATIN_CHARACTER(x) (('a'<=x && x<='z')||(x==0xA4)||(x==0x80))
250 
251  for(unsigned i=0; ; ++i)
252  {
253  unsigned char c1 = static_cast<unsigned char>( str1[ i ] );
254  unsigned char c2 = static_cast<unsigned char>( str2[ i ] );
255 
256  if( ( i % 2 ) == 0 )
257  {
258  //on the highbyte (of kana)
259  if( c2 == 0xA5 ) //Make katakana and hiragana equivelent
260  {
261  c2 = 0xA4;
262  }
263 
264  if( c1 == 0xA5 )
265  {
266  c1 = 0xA4;
267  }
268  }
269 
270  if( ( 'A' <= c1 ) && ( c1 <= 'Z' ) )
271  {
272  c1 |= 0x20; // 'fix' uppercase
273  }
274  if( ( 'A' <= c2 ) && ( c2 <= 'Z' ) )
275  {
276  c2 |= 0x20;
277  }
278 
279  if( c1 == '\0' )
280  {
281  if( ! EUC_LATIN_CHARACTER( c2 ) )
282  {
283  return 0;
284  }
285 
286  return c2;
287  }
288 
289  if( c1 != c2 )
290  {
291  return (int)c2 - (int)c1;
292  }
293  }
294 
295  return 0; //shouldn't happen... but gcc will warn if this isn't here
296 }
297 
298 bool IndexedEdictFile::loadFile( const QString &fileName )
299 {
300  if( m_valid )
301  {
302  return false;
303  }
304 
305  m_dictFile.setFileName( fileName );
306  if( ! m_dictFile.exists() )
307  {
308  return false; //Bail if the file doesn't exist
309  }
310 
311  m_dictPtr = static_cast<unsigned char*>( MAP_FAILED );
312  m_indexFile.setFileName( KGlobal::dirs()->saveLocation( "data", "kiten/xjdx/", true )
313  + QFileInfo( fileName ).baseName() + ".xjdx" );
314  m_indexPtr = static_cast<uint32_t*>( MAP_FAILED );
315  if( ! m_indexFile.exists() )
316  {
317  //If the index file isn't there, build it
318  //TODO: Verify the format if the index doesn't exist?
319  if( ! buildIndex() ) //If we can't build the file, bail
320  {
321  return false;
322  }
323  }
324 
325  if( ! m_dictFile.open( QIODevice::ReadOnly ) )
326  {
327  return false;
328  }
329 
330  if( m_indexFile.open( QIODevice::ReadOnly ) )
331  {
332  if( checkIndex() )
333  {
334  if( loadmmaps() )
335  {
336  m_valid = true;
337  return true;
338  }
339  }
340 
341  m_indexFile.close();
342  }
343 
344  //Success is actually in the middle of that if statement, so if we get here
345  //something failed and we need to clean up
346  m_dictFile.close();
347  return false;
348 }
349 
350 //Warning: This assumes that both files have already been opened
351 bool IndexedEdictFile::loadmmaps()
352 {
353  m_indexPtr = static_cast<uint32_t*>(
354  mmap(0, m_indexFile.size(), PROT_READ, MAP_SHARED, m_indexFile.handle(), 0));
355  if( m_indexPtr == static_cast<uint32_t*>( MAP_FAILED ) )
356  {
357  return false;
358  }
359 
360  m_dictPtr = static_cast<unsigned char*>( mmap( 0
361  , m_dictFile.size()
362  , PROT_READ
363  , MAP_SHARED
364  , m_dictFile.handle()
365  , 0 ) );
366  if( m_dictPtr == static_cast<unsigned char*>( MAP_FAILED ) )
367  {
368  munmap( static_cast<void*>( m_indexPtr ), m_indexFile.size() );
369  m_indexPtr = static_cast<uint32_t*>( MAP_FAILED );
370  return false;
371  }
372 
373  return true;
374 }
375 
382 inline unsigned char IndexedEdictFile::lookupDictChar( uint32_t i ) const
383 {
384  if( i > static_cast<uint32_t>( m_dictFile.size() ) /*|| i < 0*/ )
385  {
386  return 0x0A; //If out of bounds, return endl
387  }
388 
389  return m_dictPtr[i];
390 }
391 
397 QByteArray IndexedEdictFile::lookupDictLine( uint32_t i ) const
398 {
399  if( i > static_cast<uint32_t>( m_dictFile.size()) /*|| i < 0*/ )
400  {
401  return QByteArray( "" );
402  }
403 
404  uint32_t start = m_indexPtr[ i ] - 1;
405  uint32_t pos = start;
406  const unsigned size = m_dictFile.size();
407  //Grab the whole word
408  //As long as we don't get EOF, null or newline... keep going forward
409  while( pos<=size && m_dictPtr[ pos ] != 0 && m_dictPtr[ pos ] != 0x0A )
410  {
411  ++pos;
412  }
413 
414  //Copy the word to a QCString
415  QByteArray retval( (const char*)( m_dictPtr + start )
416  , 1 + pos - start );
417  //and away we go
418  return retval;
419 }
420 
424 QByteArray IndexedEdictFile::lookupFullLine( uint32_t i ) const
425 {
426  if( i > static_cast<uint32_t>( m_dictFile.size() ) /*|| i < 0*/ )
427  {
428  return QByteArray (0x0A, 1 ); //If out of bounds, return endl
429  }
430 
431  uint32_t start = i;
432  uint32_t pos = start;
433  const unsigned max = m_dictFile.size();
434  while( pos <= max && m_dictPtr[ pos ] != 0 && m_dictPtr[ pos ] != 0x0A )
435  {
436  ++pos;
437  }
438 
439  QByteArray retval( (const char*)( m_dictPtr + start )
440  , 1 + pos - start );
441  //and away we go
442  return retval;
443 }
444 
445 bool IndexedEdictFile::valid() const
446 {
447  return m_valid;
448 }
QTextCodec::fromUnicode
QByteArray fromUnicode(const QString &str) const
QByteArray
IndexedEdictFile::findMatches
QVector< QString > findMatches(const QString &query) const
Get everything that looks remotely like a given search string.
Definition: indexededictfile.cpp:181
QFile::handle
int handle() const
QFile::fileName
QString fileName() const
QFile::setFileName
void setFileName(const QString &name)
IndexedEdictFile::loadFile
bool loadFile(const QString &fileName)
Load a file, generate the index if it doesn't already exist.
Definition: indexededictfile.cpp:298
QFile::exists
bool exists() const
indexededictfile.h
QIODevice::read
qint64 read(char *data, qint64 maxSize)
EUC_LATIN_CHARACTER
#define EUC_LATIN_CHARACTER(x)
QString
QTextCodec
QFile::open
virtual bool open(QFlags< QIODevice::OpenModeFlag > mode)
QFileInfo
IndexedEdictFile::IndexedEdictFile
IndexedEdictFile()
Create and initialize this object.
Definition: indexededictfile.cpp:38
QFile::size
virtual qint64 size() const
QFile::close
virtual void close()
IndexedEdictFile::~IndexedEdictFile
~IndexedEdictFile()
Definition: indexededictfile.cpp:45
QVector
IndexedEdictFile::valid
bool valid() const
Test if the file was properly loaded.
Definition: indexededictfile.cpp:445
QTextCodec::codecForName
QTextCodec * codecForName(const QByteArray &name)
QVector::push_back
void push_back(const T &value)
QVector::size
int size() const
QTextCodec::toUnicode
QString toUnicode(const QByteArray &a) const
uint32_t
unsigned int uint32_t
Definition: indexededictfile.h:33
This file is part of the KDE documentation.
Documentation copyright © 1996-2020 The KDE developers.
Generated on Mon Jun 22 2020 13:16:38 by doxygen 1.8.7 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.

kiten/lib

Skip menu "kiten/lib"
  • Main Page
  • Namespace List
  • Namespace Members
  • Alphabetical List
  • Class List
  • Class Hierarchy
  • Class Members
  • File List
  • File Members
  • Related Pages

kdeedu API Reference

Skip menu "kdeedu API Reference"
  • Analitza
  •     lib
  • kalgebra
  • kalzium
  •   libscience
  • kanagram
  • kig
  •   lib
  • klettres
  • marble
  • parley
  • rocs
  •   App
  •   RocsCore
  •   VisualEditor
  •   stepcore

Search



Report problems with this website to our bug tracking system.
Contact the specific authors with questions and comments about the page contents.

KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal