• Skip to content
  • Skip to link menu
KDE 3.5 API Reference
  • KDE API Reference
  • API Reference
  • Sitemap
  • Contact Us
 

kviewshell

DjVuText.cpp

Go to the documentation of this file.
00001 //C-  -*- C++ -*-
00002 //C- -------------------------------------------------------------------
00003 //C- DjVuLibre-3.5
00004 //C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
00005 //C- Copyright (c) 2001  AT&T
00006 //C-
00007 //C- This software is subject to, and may be distributed under, the
00008 //C- GNU General Public License, Version 2. The license should have
00009 //C- accompanied the software or you may obtain a copy of the license
00010 //C- from the Free Software Foundation at http://www.fsf.org .
00011 //C-
00012 //C- This program is distributed in the hope that it will be useful,
00013 //C- but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 //C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 //C- GNU General Public License for more details.
00016 //C- 
00017 //C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library
00018 //C- distributed by Lizardtech Software.  On July 19th 2002, Lizardtech 
00019 //C- Software authorized us to replace the original DjVu(r) Reference 
00020 //C- Library notice by the following text (see doc/lizard2002.djvu):
00021 //C-
00022 //C-  ------------------------------------------------------------------
00023 //C- | DjVu (r) Reference Library (v. 3.5)
00024 //C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
00025 //C- | The DjVu Reference Library is protected by U.S. Pat. No.
00026 //C- | 6,058,214 and patents pending.
00027 //C- |
00028 //C- | This software is subject to, and may be distributed under, the
00029 //C- | GNU General Public License, Version 2. The license should have
00030 //C- | accompanied the software or you may obtain a copy of the license
00031 //C- | from the Free Software Foundation at http://www.fsf.org .
00032 //C- |
00033 //C- | The computer code originally released by LizardTech under this
00034 //C- | license and unmodified by other parties is deemed "the LIZARDTECH
00035 //C- | ORIGINAL CODE."  Subject to any third party intellectual property
00036 //C- | claims, LizardTech grants recipient a worldwide, royalty-free, 
00037 //C- | non-exclusive license to make, use, sell, or otherwise dispose of 
00038 //C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the 
00039 //C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU 
00040 //C- | General Public License.   This grant only confers the right to 
00041 //C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to 
00042 //C- | the extent such infringement is reasonably necessary to enable 
00043 //C- | recipient to make, have made, practice, sell, or otherwise dispose 
00044 //C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to 
00045 //C- | any greater extent that may be necessary to utilize further 
00046 //C- | modifications or combinations.
00047 //C- |
00048 //C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
00049 //C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
00050 //C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
00051 //C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
00052 //C- +------------------------------------------------------------------
00053 // 
00054 // $Id: DjVuText.cpp,v 1.10 2004/07/07 19:23:36 leonb Exp $
00055 // $Name: release_3_5_15 $
00056 
00057 #ifdef HAVE_CONFIG_H
00058 # include "config.h"
00059 #endif
00060 #if NEED_GNUG_PRAGMAS
00061 # pragma implementation
00062 #endif
00063 
00064 #include "DjVuText.h"
00065 #include "IFFByteStream.h"
00066 #include "BSByteStream.h"
00067 #include "debug.h"
00068 #include <ctype.h>
00069 
00070 
00071 
00072 #ifdef HAVE_NAMESPACES
00073 namespace DJVU {
00074 # ifdef NOT_DEFINED // Just to fool emacs c++ mode
00075 }
00076 #endif
00077 #endif
00078 
00079 
00080 
00081 #ifdef min
00082 #undef min
00083 #endif
// Returns the smaller of two values as decided by operator<.
// When neither argument is smaller, the second one is returned.
template<class TYPE>
static inline TYPE min(TYPE a,TYPE b)
{
  if (a < b)
    return a;
  return b;
}
00086 
00087 //***************************************************************************
00088 //******************************** DjVuTXT **********************************
00089 //***************************************************************************
00090 
00091 const char DjVuTXT::end_of_column    = 013;      // VT: Vertical Tab
00092 const char DjVuTXT::end_of_region    = 035;      // GS: Group Separator
00093 const char DjVuTXT::end_of_paragraph = 037;      // US: Unit Separator
00094 const char DjVuTXT::end_of_line      = 012;      // LF: Line Feed
00095 
00096 const int DjVuTXT::Zone::version  = 1;
00097 
// Default constructor: a zone of type PAGE with an empty text range
// (start 0, length 0) and no parent zone.
00098 DjVuTXT::Zone::Zone()
00099   : ztype(DjVuTXT::PAGE), text_start(0), text_length(0), zone_parent(0)
00100 {
00101 }
00102 
00103 DjVuTXT::Zone *
00104 DjVuTXT::Zone::append_child()
00105 {
00106   Zone empty;
00107   empty.ztype = ztype;
00108   empty.text_start = 0;
00109   empty.text_length = 0;
00110   empty.zone_parent=this;
00111   children.append(empty);
00112   return & children[children.lastpos()];
00113 }
00114 
00115 void
00116 DjVuTXT::Zone::cleartext()
00117 {
00118   text_start = 0;
00119   text_length = 0;
00120   for (GPosition i=children; i; ++i)
00121     children[i].cleartext();
00122 }
00123 
00124 void
00125 DjVuTXT::Zone::normtext(const char *instr, GUTF8String &outstr)
00126 {
00127   if (text_length == 0)
00128     {
00129       // Descend collecting text below
00130       text_start = outstr.length();
00131       for (GPosition i=children; i; ++i)
00132         children[i].normtext(instr, outstr);
00133       text_length = outstr.length() - text_start;
00134       // Ignore empty zones
00135       if (text_length == 0)
00136         return;
00137     }
00138   else
00139     {
00140       // Collect text at this level
00141       int new_start = outstr.length();
00142       outstr = outstr + GUTF8String(instr+text_start, text_length);
00143       text_start = new_start;
00144       // Clear textual information on lower level nodes
00145       for (GPosition i=children; i; ++i)
00146         children[i].cleartext();
00147     }
00148   // Determine standard separator
00149   char sep;
00150   switch (ztype)
00151     {
00152     case COLUMN:
00153       sep = end_of_column; break;
00154     case REGION:
00155       sep = end_of_region; break;
00156     case PARAGRAPH: 
00157       sep = end_of_paragraph; break;
00158     case LINE:
00159       sep = end_of_line; break;
00160     case WORD:
00161       sep = ' '; break;
00162     default:
00163       return;
00164     }
00165   // Add separator if not present yet.
00166   if (outstr[text_start+text_length-1] != sep)
00167     {
00168       outstr = outstr + GUTF8String(&sep, 1);
00169       text_length += 1;
00170     }
00171 }
00172 
00173 unsigned int 
00174 DjVuTXT::Zone::memuse() const
00175 {
00176   int memuse = sizeof(*this);
00177   for (GPosition i=children; i; ++i)
00178     memuse += children[i].memuse();
00179   return memuse;
00180 }
00181 
00182 
00183 #ifndef NEED_DECODER_ONLY
// Serializes this zone and, recursively, all of its children to gbs.
//   gbs    - destination byte stream
//   parent - enclosing zone, or null (DjVuTXT::encode calls this with gbs
//            only, so parent and prev presumably default to null in the
//            header - confirm there)
//   prev   - previous sibling already encoded, or null for a first child
// Layout written per zone: type (8 bits); x, y, width, height (16 bits
// each, biased by 0x8000); text start offset (16 bits, biased by 0x8000);
// text length (24 bits); number of children (24 bits); then the children.
// Position and text start are written relative to prev when there is one,
// otherwise relative to parent, otherwise as absolute values.
00184 void 
00185 DjVuTXT::Zone::encode(
00186   const GP<ByteStream> &gbs, const Zone * parent, const Zone * prev) const
00187 {
00188   ByteStream &bs=*gbs;
00189   // Encode type
00190   bs.write8(ztype);
00191   
00192   // Modify text_start and bounding rectangle based on the context
00193   // (whether there is a previous non-zero same-level-child or parent)
00194   int start=text_start;
00195   int x=rect.xmin, y=rect.ymin;
00196   int width=rect.width(), height=rect.height();
00197   if (prev)
00198   {
00199     if (ztype==PAGE || ztype==PARAGRAPH || ztype==LINE)
00200     {
00201       // Encode offset from the lower left corner of the previous
00202       // child in the coord system in that corner with x to the
00203       // right and y down
00204       x=x-prev->rect.xmin;
00205       y=prev->rect.ymin-(y+height);
00206     } else // Either COLUMN or WORD or CHARACTER
00207     {
00208       // Encode offset from the lower right corner of the previous
00209       // child in the coord system in that corner with x to the
00210       // right and y up
00211       x=x-prev->rect.xmax;
00212       y=y-prev->rect.ymin;
00213     }
      // Text start becomes the gap after the end of the previous sibling's text
00214     start-=prev->text_start+prev->text_length;
00215   } else if (parent)
00216   {
00217     // Encode offset from the upper left corner of the parent
00218     // in the coord system in that corner with x to the right and y down
00219     x=x-parent->rect.xmin;
00220     y=parent->rect.ymax-(y+height);
00221     start-=parent->text_start;
00222   }
00223   // Encode rectangle
  // (each value is biased by 0x8000 so that negative offsets can be stored)
00224   bs.write16(0x8000+x);
00225   bs.write16(0x8000+y);
00226   bs.write16(0x8000+width);
00227   bs.write16(0x8000+height);
00228   // Encode text info
00229   bs.write16(0x8000+start);
00230   bs.write24(text_length);
00231   // Encode number of children
00232   bs.write24(children.size());
00233   
00234   const Zone * prev_child=0;
00235   // Encode all children
  // (each child gets this zone as parent and its predecessor as prev)
00236   for (GPosition i=children; i; ++i)
00237   {
00238     children[i].encode(gbs, this, prev_child);
00239     prev_child=&children[i];
00240   }
00241 }
00242 #endif
00243 
// Reads this zone and, recursively, its children from gbs; the inverse of
// Zone::encode().
//   gbs     - source byte stream
//   maxtext - length of the decoded text; a zone whose text range extends
//             past it is rejected
//   parent  - enclosing zone, or null (DjVuTXT::decode passes only gbs and
//             maxtext, so parent and prev presumably default to null in
//             the header - confirm there)
//   prev    - previous sibling already decoded, or null for a first child
// Throws "DjVuText.corrupt_text" when the zone type is outside
// PAGE..CHARACTER, when the decoded rectangle is empty, or when the text
// range is negative or exceeds maxtext.
00244 void 
00245 DjVuTXT::Zone::decode(const GP<ByteStream> &gbs, int maxtext,
00246               const Zone * parent, const Zone * prev)
00247 {
00248   ByteStream &bs=*gbs;
00249   // Decode type
00250   ztype = (ZoneType) bs.read8();
00251   if ( ztype<PAGE || ztype>CHARACTER )
00252     G_THROW( ERR_MSG("DjVuText.corrupt_text") );
00253 
00254   // Decode coordinates
  // (stored biased by 0x8000; subtracting restores the signed value)
00255   int x=(int) bs.read16()-0x8000;
00256   int y=(int) bs.read16()-0x8000;
00257   int width=(int) bs.read16()-0x8000;
00258   int height=(int) bs.read16()-0x8000;
00259 
00260   // Decode text info
00261   text_start = (int) bs.read16()-0x8000;
00262 //  int start=text_start;
00263   text_length = bs.read24();
  // Convert the relative position and text start back to absolute values,
  // using the same reference (prev, else parent) that encode() used
00264   if (prev)
00265   {
00266     if (ztype==PAGE || ztype==PARAGRAPH || ztype==LINE)
00267     {
00268       x=x+prev->rect.xmin;
00269       y=prev->rect.ymin-(y+height);
00270     } else // Either COLUMN or WORD or CHARACTER
00271     {
00272       x=x+prev->rect.xmax;
00273       y=y+prev->rect.ymin;
00274     }
00275     text_start+=prev->text_start+prev->text_length;
00276   } else if (parent)
00277   {
00278     x=x+parent->rect.xmin;
00279     y=parent->rect.ymax-(y+height);
00280     text_start+=parent->text_start;
00281   }
00282   rect=GRect(x, y, width, height);
00283   // Get children size
00284   int size = bs.read24();
00285 
00286   // Checks
00287   if (rect.isempty() || text_start<0 || text_start+text_length>maxtext )
00288     G_THROW( ERR_MSG("DjVuText.corrupt_text") );
00289 
00290   // Process children
  // (any existing children are discarded before decoding the new ones)
00291   const Zone * prev_child=0;
00292   children.empty();
00293   while (size-- > 0) 
00294   {
00295     Zone *z = append_child();
00296     z->decode(gbs, maxtext, this, prev_child);
00297     prev_child=z;
00298   }
00299 }
00300 
00301 void 
00302 DjVuTXT::normalize_text()
00303 {
00304   GUTF8String newtextUTF8;
00305   page_zone.normtext( (const char*)textUTF8, newtextUTF8 );
00306   textUTF8 = newtextUTF8;
00307 }
00308 
00309 int 
00310 DjVuTXT::has_valid_zones() const
00311 {
00312   if (!textUTF8)
00313     return false;
00314   if (page_zone.children.isempty() || page_zone.rect.isempty()) 
00315     return false;
00316   return true;
00317 }
00318 
00319 
00320 #ifndef NEED_DECODER_ONLY
00321 void 
00322 DjVuTXT::encode(const GP<ByteStream> &gbs) const
00323 {
00324   ByteStream &bs=*gbs;
00325   if (! textUTF8 )
00326     G_THROW( ERR_MSG("DjVuText.no_text") );
00327   // Encode text
00328   int textsize = textUTF8.length();
00329   bs.write24( textsize );
00330   bs.writall( (void*)(const char*)textUTF8, textsize );
00331   // Encode zones
00332   if (has_valid_zones())
00333   {
00334     bs.write8(Zone::version);
00335     page_zone.encode(gbs);
00336   }
00337 }
00338 #endif
00339 
00340 void 
00341 DjVuTXT::decode(const GP<ByteStream> &gbs)
00342 {
00343   ByteStream &bs=*gbs;
00344   // Read text
00345   textUTF8.empty();
00346   int textsize = bs.read24();
00347   char *buffer = textUTF8.getbuf(textsize);
00348   int readsize = bs.read(buffer,textsize);
00349   buffer[readsize] = 0;
00350   if (readsize < textsize)
00351     G_THROW( ERR_MSG("DjVuText.corrupt_chunk") );
00352   // Try reading zones
00353   unsigned char version;
00354   if ( bs.read( (void*) &version, 1 ) == 1) 
00355   {
00356     if (version != Zone::version)
00357       G_THROW( ERR_MSG("DjVuText.bad_version") "\t" + GUTF8String(version) );
00358     page_zone.decode(gbs, textsize);
00359   }
00360 }
00361 
00362 GP<DjVuTXT> 
00363 DjVuTXT::copy(void) const
00364 {
00365   return new DjVuTXT(*this);
00366 }
00367 
00368 
00369 static inline bool
00370 intersects_zone(GRect box, const GRect &zone)
00371 {
00372   return
00373     ((box.xmin < zone.xmin)
00374       ?(box.xmax >= zone.xmin)
00375       :(box.xmin <= zone.xmax))
00376     &&((box.ymin < zone.ymin)
00377       ?(box.ymax >= zone.ymin)
00378       :(box.ymin <= zone.ymax));
00379 }
00380 
00381 void
00382 DjVuTXT::Zone::get_text_with_rect(const GRect &box, 
00383                                   int &string_start, int &string_end) const
00384 {
00385   GPosition pos=children;
00386   if(pos?box.contains(rect):intersects_zone(box,rect))
00387   {
00388     const int text_end=text_start+text_length;
00389     if(string_start == string_end)
00390     {
00391       string_start=text_start;
00392       string_end=text_end;
00393     }else
00394     {
00395       if (string_end < text_end)
00396         string_end=text_end;
00397       if(text_start < string_start)
00398         string_start=text_start;
00399     }
00400   }else if(pos&&intersects_zone(box,rect))
00401   {
00402     do
00403     {
00404       children[pos].get_text_with_rect(box,string_start,string_end);
00405     } while(++pos);
00406   }
00407 }
00408 
00409 void
00410 DjVuTXT::Zone::find_zones(GList<Zone *> &list, 
00411                           const int string_start, const int string_end) const
00412 {
00413   const int text_end=text_start+text_length;
00414   if(text_start >= string_start)
00415     {
00416       if(text_end <= string_end)
00417         {
00418           list.append(const_cast<Zone *>(this));
00419         }
00420       else if(text_start < string_end)
00421         {
00422           if (children.size())
00423             for (GPosition pos=children; pos; ++pos)
00424               children[pos].find_zones(list,string_start,string_end);
00425           else
00426             list.append(const_cast<Zone *>(this));
00427         }
00428     }
00429   else if( text_end > string_start)
00430     {
00431       if (children.size())
00432         for (GPosition pos=children; pos; ++pos)
00433           children[pos].find_zones(list,string_start,string_end);
00434       else
00435         list.append(const_cast<Zone *>(this));
00436     }
00437 }
00438 
00439 void
00440 DjVuTXT::Zone::get_smallest(GList<GRect> &list) const
00441 {
00442   GPosition pos=children;
00443   if(pos)
00444     {
00445       do {
00446         children[pos].get_smallest(list);
00447       } while (++pos);
00448     }
00449   else
00450     {
00451       list.append(rect);
00452     }
00453 }
00454 
00455 void
00456 DjVuTXT::Zone::get_smallest(GList<GRect> &list, const int padding) const
00457 {
00458   GPosition pos=children;
00459   if(pos)
00460     {
00461       do {
00462         children[pos].get_smallest(list,padding);
00463       } while (++pos);
00464     }
00465   else if(zone_parent && zone_parent->ztype >= PARAGRAPH)
00466     {
00467       const GRect &xrect=zone_parent->rect;
00468       if(xrect.height() < xrect.width())
00469         {
00470           list.append(GRect(rect.xmin-padding,xrect.ymin-padding,rect.width()
00471                             +2*padding,xrect.height()+2*padding));
00472         }
00473       else
00474         {
00475           list.append(GRect(xrect.xmin-padding,rect.ymin-padding,xrect.width()
00476                             +2*padding,rect.height()+2*padding));
00477         }
00478     }
00479   else
00480     {
00481       list.append(GRect(rect.xmin-padding,rect.ymin-padding,rect.width()
00482                         +2*padding,rect.height()+2*padding));
00483     }
00484 }
00485 
00486 void
00487 DjVuTXT::get_zones(int zone_type, const Zone *parent, 
00488                    GList<Zone *> & zone_list) const 
00489    // get all the zones of  type zone_type under zone node parent
00490 {
00491    // search all branches under parent
00492    const Zone *zone=parent;
00493    for( int cur_ztype=zone->ztype; cur_ztype<zone_type; ++cur_ztype )
00494    {
00495       GPosition pos;
00496       for(pos=zone->children; pos; ++pos)
00497       {
00498      Zone *zcur=(Zone *)&zone->children[pos];
00499      if ( zcur->ztype == zone_type )
00500      {
00501         GPosition zpos=zone_list;
00502         if ( !zone_list.search(zcur,zpos) )
00503            zone_list.append(zcur);
00504      }
00505      else if ( zone->children[pos].ztype < zone_type )
00506         get_zones(zone_type, &zone->children[pos], zone_list);
00507       }
00508    }
00509 }
00510 
00511 GList<GRect>
00512 DjVuTXT::find_text_with_rect(const GRect &box, GUTF8String &text, 
00513                              const int padding) const
00514 {
00515   GList<GRect> retval;
00516   int text_start=0;
00517   int text_end=0;
00518   page_zone.get_text_with_rect(box,text_start,text_end);
00519   if(text_start != text_end)
00520   {
00521     GList<Zone *> zones;
00522     page_zone.find_zones(zones,text_start,text_end);
00523     GPosition pos=zones;
00524     if(pos)
00525     {
00526       do
00527       {
00528         if(padding >= 0)
00529         {
00530           zones[pos]->get_smallest(retval,padding);
00531         }else
00532         {
00533           zones[pos]->get_smallest(retval);
00534         }
00535       } while(++pos);
00536     }
00537   }
00538   text=textUTF8.substr(text_start,text_end-text_start);
00539   return retval;
00540 }
00541 
00542 
// Selects WORD zones touched by target_rect.
// Steps, as implemented below:
//   1. Pick the lines to consider.  If PARAGRAPH zones exist, choose the
//      paragraph with the largest (intersection area / own area) ratio and
//      take its LINE zones; otherwise take all LINE zones of the page.  A
//      line is kept when its intersection with target_rect is taller than
//      half of the line's own height.
//   2. Pick the words.  With a single line, keep the words that intersect
//      target_rect.  With several lines, keep the first line from its
//      first intersecting word onwards, every word of the middle lines,
//      and the last line up to its last intersecting word.
// NOTE(review): the parameter `text` is never written by this function.
// NOTE(review): the words of the last line are appended while walking the
// line backwards, so they appear in reverse order in the result - confirm
// that callers expect this.
// NOTE(review): GRect::intersect(a,b) is assumed to store the intersection
// of a and b in the receiver and to return non-zero when it is not empty -
// verify against GRect.h.
00543 GList<DjVuTXT::Zone *>
00544 DjVuTXT::find_text_in_rect(GRect target_rect, GUTF8String &text) const
00545    // returns a list of zones of type WORD in the nearest/selected paragraph 
00546 {
00547    GList<Zone *> zone_list;
00548    GList<Zone *> lines;
00549 
00550    get_zones((int)PARAGRAPH, &page_zone, zone_list);
00551    // it's possible that no paragraph structure exists for reasons that  
00552    // 1) ocr engine is not capable 2) file was modified by user. In such case, 
00553    // we can only make a rough guess, i.e., select all the lines intersected with
00554    // target_rect
00555    if (zone_list.isempty())
00556    {
00557       get_zones((int)LINE, &page_zone, zone_list);
00558       GPosition pos;
00559       for(pos=zone_list; pos; ++pos)
00560       {
00561      GRect rect=zone_list[pos]->rect;
     // h0 is half of the line's full height, taken before rect is
     // overwritten by the intersection
00562      int h0=rect.height()/2;
00563      if(rect.intersect(rect,target_rect) && rect.height()>h0)
00564         lines.append(zone_list[pos]);
00565       }
00566    } else 
00567    {
00568       GPosition pos, pos_sel=zone_list;
      // ar is the best coverage ratio seen so far; 0 means no paragraph
      // intersects target_rect
00569       float ar=0;
00570       for(pos=zone_list; pos; ++pos)
00571       {
00572      GRect rect=zone_list[pos]->rect;
00573      int area=rect.area();
00574      if (rect.intersect(rect, target_rect))
00575      {
00576         float ftmp=rect.area()/(float)area;
00577         if ( !ar || ar<ftmp )
00578         {
00579            ar=ftmp;
00580            pos_sel=pos;
00581         }
00582      }
00583       }
00584       Zone *parag = 0;
00585       if ( ar>0 ) parag=zone_list[pos_sel];
00586       zone_list.empty();
00587       if ( ar>0 ) 
00588       {
00589      get_zones((int)LINE, parag, zone_list);
00590      if ( !zone_list.isempty() )
00591      {
00592         for(GPosition pos=zone_list; pos; ++pos)
00593         {
00594            GRect rect=zone_list[pos]->rect;
00595            int h0=rect.height()/2;
00596            if(rect.intersect(rect,target_rect) && rect.height()>h0)
00597           lines.append(zone_list[pos]);
00598         }
00599      }
00600       }
00601    }
00602 
   // zone_list is reused from here on to hold the selected words
00603    zone_list.empty();
00604    if (!lines.isempty()) 
00605    {
      // i is the 1-based index of the line being processed
00606       int i=1, lsize=lines.size();
00607 
00608       GList<Zone *> words;
00609       for (GPosition pos=lines; pos; ++pos, ++i)
00610       {
00611      words.empty();
00612      get_zones((int)WORD, lines[pos], words);
00613 
00614      if ( lsize==1 )
00615      {
00616         for(GPosition p=words;p;++p)
00617         {
00618            GRect rect=words[p]->rect;
00619            if(rect.intersect(rect,target_rect))
00620            //if (target_rect.contains(words[p]->rect))
00621           zone_list.append(words[p]);
00622         }
00623      } else
00624      {
00625         if (i==1)
00626         {
           // first line: skip words until one intersects, then keep the rest
00627            bool start=true;
00628            for(GPosition p=words; p; ++p)
00629            {
00630           if ( start )
00631           {
00632              GRect rect=words[p]->rect;
00633              if(rect.intersect(rect,target_rect))
00634             //if (target_rect.contains(words[p]->rect))
00635              {
00636             start=false;
00637             zone_list.append(words[p]);
00638              }
00639           } else 
00640              zone_list.append(words[p]);
00641            }
00642         } else if (i==lsize)
00643         {
           // last line: walk backwards, skip words until one intersects,
           // then keep the remaining (earlier) words
00644            bool end=true;
00645            for(GPosition p=words.lastpos();p;--p)
00646            {
00647           if ( end )
00648           {
00649              GRect rect=words[p]->rect;
00650              if(rect.intersect(rect,target_rect))
00651             //if(target_rect.contains(words[p]->rect) )
00652              {
00653             end=false;
00654             zone_list.append(words[p]);
00655              }
00656           } else 
00657              zone_list.append(words[p]);
00658            }
00659         }
00660 
00661         if (i!=1 && i!=lsize )
00662         {
           // middle lines: keep every word
00663            for(GPosition p=words;p;++p)
00664           zone_list.append(words[p]);
00665         }
00666      }
00667       }
00668    } 
00669 
00670    return zone_list;
00671 }
00672 
00673 unsigned int 
00674 DjVuTXT::get_memory_usage() const
00675 {
00676   return sizeof(*this) + textUTF8.length() + page_zone.memuse() - sizeof(page_zone); 
00677 }
00678 
00679 
00680 
00681 //***************************************************************************
00682 //******************************** DjVuText *********************************
00683 //***************************************************************************
00684 
00685 void
00686 DjVuText::decode(const GP<ByteStream> &gbs)
00687 {
00688   GUTF8String chkid;
00689   GP<IFFByteStream> giff=IFFByteStream::create(gbs);
00690   IFFByteStream &iff=*giff;
00691   while( iff.get_chunk(chkid) )
00692   {
00693     if (chkid == "TXTa")
00694     {
00695       if (txt)
00696         G_THROW( ERR_MSG("DjVuText.dupl_text") );
00697       txt = DjVuTXT::create();
00698       txt->decode(iff.get_bytestream());
00699     }
00700     else if (chkid == "TXTz")
00701     {
00702       if (txt)
00703         G_THROW( ERR_MSG("DjVuText.dupl_text") );
00704       txt = DjVuTXT::create();
00705       const GP<ByteStream> gbsiff=BSByteStream::create(iff.get_bytestream());
00706       txt->decode(gbsiff);
00707     }
00708     // Add decoding of other chunks here
00709     iff.close_chunk();
00710   }
00711 }
00712 
00713 void
00714 DjVuText::encode(const GP<ByteStream> &gbs)
00715 {
00716   if (txt)
00717   {
00718     const GP<IFFByteStream> giff=IFFByteStream::create(gbs);
00719     IFFByteStream &iff=*giff;
00720     iff.put_chunk("TXTz");
00721     {
00722       GP<ByteStream> gbsiff=BSByteStream::create(iff.get_bytestream(),50);
00723       txt->encode(gbsiff);
00724     }
00725     iff.close_chunk();
00726   }
00727   // Add encoding of other chunks here
00728 }
00729 
00730 
00731 GP<DjVuText>
00732 DjVuText::copy(void) const
00733 {
00734    GP<DjVuText> text= new DjVuText;
00735       // Copy any primitives (if any)
00736    *text=*this;
00737       // Copy each substructure
00738    if (txt)
00739      text->txt = txt->copy();
00740    return text;
00741 }
00742 
00743 static GUTF8String
00744 indent ( int spaces)
00745 {
00746   GUTF8String ret;
00747   for( int i = 0 ; i < spaces ; i++ )
00748     ret += ' ';
00749   return ret;
00750 }
00751 
// XML element names, indexed by zone type value.  Index 0 has no name;
// DjVuTXT::PAGE, WORD and CHARACTER are used as indices elsewhere in this
// file.
static const char *tags[8]=
{
  0,              // 0: no tag
  "HIDDENTEXT",   // 1
  "PAGECOLUMN",   // 2
  "REGION",       // 3
  "PARAGRAPH",    // 4
  "LINE",         // 5
  "WORD",         // 6
  "CHARACTER"     // 7
};
// Number of entries in tags[].
static const int tags_size=sizeof(tags)/sizeof(tags[0]);
00762 
00763 static GUTF8String
00764 start_tag(const DjVuTXT::ZoneType zone)
00765 {
00766   GUTF8String retval;
00767   if((tags_size > (int)zone)&&((int)zone > 0))
00768   {
00769     switch (zone)
00770     {
00771       case DjVuTXT::CHARACTER:
00772         retval="<"+GUTF8String(tags[zone])+">";
00773         break;
00774       case DjVuTXT::WORD:
00775         retval=indent(2*(int)zone+2)+"<"+tags[zone]+">";
00776         break;
00777       default:
00778         retval=indent(2*(int)zone+2)+"<"+tags[zone]+">\n";
00779         break;
00780     }
00781   }
00782   return retval;
00783 }
00784 
00785 static GUTF8String
00786 start_tag(const DjVuTXT::ZoneType zone, const GUTF8String &attributes)
00787 {
00788   GUTF8String retval;
00789   if((tags_size > (int)zone)&&((int)zone > 0))
00790   {
00791     switch (zone)
00792     {
00793       case DjVuTXT::CHARACTER:
00794         retval="<"+GUTF8String(tags[zone])+" "+attributes+">";
00795         break;
00796       case DjVuTXT::WORD:
00797         retval=indent(2*(int)zone+2)+"<"+tags[zone]+" "+attributes+">";
00798         break;
00799       default:
00800         retval=indent(2*(int)zone+2)+"<"+tags[zone]+" "+attributes+">\n";
00801         break;
00802     }
00803   }
00804   return retval;
00805 }
00806 
00807 static inline GUTF8String
00808 start_tag(const int layer)
00809 {
00810   return start_tag((const DjVuTXT::ZoneType)layer);
00811 }
00812 
00813 
00814 static GUTF8String
00815 end_tag(const DjVuTXT::ZoneType zone)
00816 {
00817   GUTF8String retval;
00818   if((tags_size > (int)zone)&&((int)zone >= 0))
00819   {
00820     switch (zone)
00821     {
00822       case DjVuTXT::CHARACTER:
00823         retval="</"+GUTF8String(tags[zone])+">";
00824         break;
00825       case DjVuTXT::WORD:
00826         retval="</"+GUTF8String(tags[zone])+">\n";
00827         break;
00828       default:
00829         retval=indent(2*(int)zone+2)+"</"+tags[zone]+">\n";
00830         break;
00831     }
00832   }
00833   return retval;
00834 }
00835 
00836 static inline GUTF8String
00837 end_tag(const int layer)
00838 {
00839   return end_tag((const DjVuTXT::ZoneType)layer);
00840 }
00841 
00842 static GUTF8String
00843 tolayer(int &layer, const DjVuTXT::ZoneType next_layer)
00844 {
00845   GUTF8String retval;
00846   for( ;layer < (int)next_layer;layer++ )
00847   {
00848     retval+=start_tag(layer);
00849   }
00850   while (layer > (int)next_layer )
00851   {
00852     retval+=end_tag(--layer);
00853   }
00854   return retval;
00855 }
00856 
00857 static void
00858 writeText( ByteStream & str_out,
00859             const GUTF8String &textUTF8,
00860             const DjVuTXT::Zone &zone,
00861             const int WindowHeight );
00862 
00863 static void
00864 writeText( ByteStream & str_out,
00865            const GUTF8String &textUTF8,
00866            const DjVuTXT::ZoneType zlayer,
00867            const GList<DjVuTXT::Zone> &children,
00868            const int WindowHeight )
00869 {
00870 //  assert( txt->has_valid_zones() );
00871 //  DEBUG_MSG( "--zonetype=" << txt->page_zone.ztype << "\n" );
00872 
00873   //  Beginning tags for missing layers
00874   int layer=(int)zlayer;
00875   //  Output the next layer
00876   for(GPosition pos=children ; pos ; ++pos )
00877   {
00878     str_out.writestring(tolayer(layer,children[pos].ztype));
00879     writeText( str_out,
00880                 textUTF8,
00881                 children[pos],
00882                 WindowHeight );
00883   }
00884   str_out.writestring(tolayer(layer,zlayer));
00885 }
00886 
00887 static void
00888 writeText( ByteStream & str_out,
00889             const GUTF8String &textUTF8,
00890             const DjVuTXT::Zone &zone,
00891             const int WindowHeight )
00892 {
00893 //  DEBUG_MSG( "--zonetype=" << zone.ztype << "\n" );
00894 
00895   const GUTF8String xindent(indent( 2 * zone.ztype + 2 ));
00896   GPosition pos=zone.children;
00897   // Build attribute string
00898   if( ! pos )
00899   {
00900     GUTF8String coords;
00901     coords.format("coords=\"%d,%d,%d,%d\"",
00902       zone.rect.xmin, WindowHeight - 1 - zone.rect.ymin,
00903       zone.rect.xmax, WindowHeight - 1 - zone.rect.ymax);
00904     const int start=zone.text_start;
00905     const int end=textUTF8.firstEndSpace(start,zone.text_length);
00906     str_out.writestring(start_tag(zone.ztype,coords));
00907     str_out.writestring(textUTF8.substr(start,end-start).toEscaped());
00908     str_out.writestring(end_tag(zone.ztype));
00909   } else
00910   {
00911     writeText(str_out,textUTF8,zone.ztype,zone.children,WindowHeight);
00912   }
00913 }
00914 
00915 void
00916 DjVuTXT::writeText(ByteStream &str_out,const int height) const
00917 {
00918   if(has_valid_zones())
00919   {
00920     ::writeText(str_out,textUTF8,DjVuTXT::PAGE,page_zone.children,height);
00921   }else
00922   {
00923     str_out.writestring(start_tag(DjVuTXT::PAGE));
00924     str_out.writestring(end_tag(DjVuTXT::PAGE));
00925   }
00926 }
00927 
00928 void
00929 DjVuText::writeText(ByteStream &str_out,const int height) const
00930 {
00931   if(txt)
00932   {
00933     txt->writeText(str_out,height);
00934   }else
00935   {
00936     str_out.writestring("<"+GUTF8String(tags[DjVuTXT::PAGE])+"/>\n");
00937   }
00938    
00939 }
00940 GUTF8String
00941 DjVuTXT::get_xmlText(const int height) const
00942 {
00943   GP<ByteStream> gbs(ByteStream::create());
00944   ByteStream &bs=*gbs;
00945   writeText(bs,height);
00946   bs.seek(0L);
00947   return bs.getAsUTF8();
00948 }
00949 
00950 GUTF8String
00951 DjVuText::get_xmlText(const int height) const
00952 {
00953   GUTF8String retval;
00954   if(txt)
00955   {
00956     retval=txt->get_xmlText(height);
00957   }else
00958   {
00959     retval="<"+GUTF8String(tags[DjVuTXT::PAGE])+"/>\n";
00960   }
00961   return retval;
00962 }
00963 
00964 
00965 #ifdef HAVE_NAMESPACES
00966 }
00967 # ifndef NOT_USING_DJVU_NAMESPACE
00968 using namespace DJVU;
00969 # endif
00970 #endif
00971 

kviewshell

Skip menu "kviewshell"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members

API Reference

Skip menu "API Reference"
  • kviewshell
Generated for API Reference by doxygen 1.5.9
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal