• Skip to content
  • Skip to link menu
KDE 3.5 API Reference
  • KDE API Reference
  • API Reference
  • Sitemap
  • Contact Us
 

kviewshell

XMLParser.cpp

Go to the documentation of this file.
00001 //C-  -*- C++ -*-
00002 //C- -------------------------------------------------------------------
00003 //C- DjVuLibre-3.5
00004 //C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
00005 //C- Copyright (c) 2001  AT&T
00006 //C-
00007 //C- This software is subject to, and may be distributed under, the
00008 //C- GNU General Public License, Version 2. The license should have
00009 //C- accompanied the software or you may obtain a copy of the license
00010 //C- from the Free Software Foundation at http://www.fsf.org .
00011 //C-
00012 //C- This program is distributed in the hope that it will be useful,
00013 //C- but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 //C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 //C- GNU General Public License for more details.
00016 //C- 
00017 //C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library
00018 //C- distributed by Lizardtech Software.  On July 19th 2002, Lizardtech 
00019 //C- Software authorized us to replace the original DjVu(r) Reference 
00020 //C- Library notice by the following text (see doc/lizard2002.djvu):
00021 //C-
00022 //C-  ------------------------------------------------------------------
00023 //C- | DjVu (r) Reference Library (v. 3.5)
00024 //C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
00025 //C- | The DjVu Reference Library is protected by U.S. Pat. No.
00026 //C- | 6,058,214 and patents pending.
00027 //C- |
00028 //C- | This software is subject to, and may be distributed under, the
00029 //C- | GNU General Public License, Version 2. The license should have
00030 //C- | accompanied the software or you may obtain a copy of the license
00031 //C- | from the Free Software Foundation at http://www.fsf.org .
00032 //C- |
00033 //C- | The computer code originally released by LizardTech under this
00034 //C- | license and unmodified by other parties is deemed "the LIZARDTECH
00035 //C- | ORIGINAL CODE."  Subject to any third party intellectual property
00036 //C- | claims, LizardTech grants recipient a worldwide, royalty-free, 
00037 //C- | non-exclusive license to make, use, sell, or otherwise dispose of 
00038 //C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the 
00039 //C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU 
00040 //C- | General Public License.   This grant only confers the right to 
00041 //C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to 
00042 //C- | the extent such infringement is reasonably necessary to enable 
00043 //C- | recipient to make, have made, practice, sell, or otherwise dispose 
00044 //C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to 
00045 //C- | any greater extent that may be necessary to utilize further 
00046 //C- | modifications or combinations.
00047 //C- |
00048 //C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
00049 //C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
00050 //C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
00051 //C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
00052 //C- +------------------------------------------------------------------
00053 // 
00054 // $Id: XMLParser.cpp,v 1.10 2003/11/07 22:08:22 leonb Exp $
00055 // $Name: release_3_5_15 $
00056 
00057 #ifdef HAVE_CONFIG_H
00058 # include "config.h"
00059 #endif
00060 #if NEED_GNUG_PRAGMAS
00061 # pragma implementation
00062 #endif
00063 
00064 // From: Leon Bottou, 1/31/2002
00065 // This is purely Lizardtech stuff.
00066 
00067 #include "XMLParser.h"
00068 #include "XMLTags.h"
00069 #include "ByteStream.h"
00070 #include "GOS.h"
00071 #include "DjVuDocument.h"
00072 #include "DjVuText.h"
00073 #include "DjVuAnno.h"
00074 #include "DjVuFile.h"
00075 #include "DjVuImage.h"
00076 #include "debug.h"
00077 #include <stdio.h>
00078 #include <ctype.h>
00079 #include <stdlib.h>
00080 
00081 
00082 #ifdef HAVE_NAMESPACES
00083 namespace DJVU {
00084 # ifdef NOT_DEFINED // Just to fool emacs c++ mode
00085 }
00086 #endif
00087 #endif
00088 
// MIME type that an OBJECT tag's "type" attribute is compared against;
// OBJECT tags declaring any other type are skipped by the parser.
00089 static const char mimetype[]="image/x.djvu";
// Names of the XML tags referenced by the parser.
00090 static const char bodytag[]="BODY";
00091 static const char areatag[]="AREA";
00092 static const char maptag[]="MAP";
00093 static const char objecttag[]="OBJECT";
00094 static const char paramtag[]="PARAM";
// Names of the text-zone tags (compared against tag names when the
// text zone tree is built).
00095 static const char wordtag[]="WORD";
00096 static const char linetag[]="LINE";
00097 static const char paragraphtag[]="PARAGRAPH";
00098 static const char regiontag[]="REGION";
00099 static const char pagecolumntag[]="PAGECOLUMN";
00100 static const char hiddentexttag[]="HIDDENTEXT";
00101 static const char metadatatag[]="METADATA";
00102 
// Concrete implementation of lt_XMLParser; lt_XMLParser::create() returns
// an instance of this class.
00103 class lt_XMLParser::Impl : public lt_XMLParser
00104 {
00105 public:
00106   Impl(void);
00107   virtual ~Impl();
        // Builds an lt_XMLTags tree from the byte stream and parses it.
00109   virtual void parse(const GP<ByteStream> &bs);
        // Processes the OBJECT tags found inside the single BODY tag of tags.
00111   virtual void parse(const lt_XMLTags &tags);
        // Saves every document held in m_docs back to its init URL, then
        // calls empty().
00113   virtual void save(void);
        // Clears both the file cache (m_files) and the document cache (m_docs).
00115   virtual void empty(void);
00116 protected:
        // Returns the DjVuFile for the given document URL and page/id string,
        // loading the document and caching both document and file on first use.
00117   GP<DjVuFile> get_file(const GURL &url,GUTF8String page);
00118 
        // Looks up the MAP named by the OBJECT's "usemap" attribute in Maps and,
        // when found, applies it to dfile through ChangeAnno().
00119   void parse_anno(const int width, const int height,
00120     const lt_XMLTags &GObject,
00121     GMap<GUTF8String,GP<lt_XMLTags> > &Maps, DjVuFile &dfile);
00122 
00123   void parse_text(const int width, const int height,
00124     const lt_XMLTags &GObject, DjVuFile &dfile);
00125 
00126   void parse_meta(const lt_XMLTags &GObject, DjVuFile &dfile);
00127 
        // Rebuilds the map areas of dfile's annotations from the AREA tags of map.
00128   void ChangeAnno( const int width, const int height,
00129     DjVuFile &dfile, const lt_XMLTags &map);
00130 
        // Updates dpi and/or gamma in dfile's info when the values are in range
        // and differ from the current ones.
00131   void ChangeInfo(DjVuFile &dfile,const int dpi,const double gamma);
00132 
00133   void ChangeText( const int width, const int height,
00134     DjVuFile &dfile, const lt_XMLTags &map);
00135 
00136   void ChangeMeta( DjVuFile &dfile, const lt_XMLTags &map);
00137 
        // Runs the OCR callback unless value is empty or "false" (case
        // insensitive) and feeds any returned XML to ChangeText().
00138   void ChangeTextOCR( const GUTF8String &value, 
00139     const int width, const int height,
00140     const GP<DjVuFile> &dfile);
00141 
00142   // We may want to make these lists of modified files static so
00143   // they only need to be loaded and saved once.
00144 
        // Files already fetched, keyed by the file URL string.
00145   GMap<GUTF8String,GP<DjVuFile> > m_files;
        // Documents already opened, keyed by the document URL string.
00146   GMap<GUTF8String,GP<DjVuDocument> > m_docs;
00147 
        // Used as the codebase of an OBJECT tag that has no "codebase"
        // attribute, provided it is a directory.
00148   GURL m_codebase; 
        // Locked by empty(), save() and get_file().
00149   GCriticalSection xmlparser_lock;
00150 };
00151 
// Forward declaration of the four-argument OCR callback entry point; its
// definition is not in this part of the file.  value and image have
// defaults (an empty string and a null image pointer).
00152 static GP<ByteStream>
00153 OCRcallback(
00154   void * const xarg,
00155   lt_XMLParser::mapOCRcallback * const xcallback,
00156   const GUTF8String &value=GUTF8String(),
00157   const GP<DjVuImage> &image=0 );
00158 
00159 static inline GP<ByteStream>
00160 OCRcallback(const GUTF8String &value, const GP<DjVuImage> &image)
00161 {
00162   return OCRcallback(0,0,value,image);
00163 }
00164 
// Constructors and destructors of the interface class and of its
// implementation; all four have empty bodies.
00165 lt_XMLParser::lt_XMLParser() {}
00166 lt_XMLParser::~lt_XMLParser() {}
00167 lt_XMLParser::Impl::Impl() {}
00168 lt_XMLParser::Impl::~Impl() {}
00169 
00170 GP<lt_XMLParser>
00171 lt_XMLParser::create(void)
00172 {
00173   return new lt_XMLParser::Impl;
00174 }
00175 
00176 // helper function for args
00177 static void 
00178 intList(GUTF8String coords, GList<int> &retval)
00179 {
00180   int pos=0;
00181   while(coords.length())
00182   {
00183     int epos;
00184     unsigned long i=coords.toLong(pos,epos,10);
00185     if(epos>=0)
00186     {
00187       retval.append(i);
00188       const int n=coords.nextNonSpace(epos);
00189       if(coords[n] != ',')
00190         break;
00191       pos=n+1;
00192     }
00193   }
00194 }
00195 
00196 void 
00197 lt_XMLParser::Impl::empty(void)
00198 {
00199   GCriticalSectionLock lock(&xmlparser_lock);
00200   m_files.empty();
00201   m_docs.empty();
00202 }
00203 
00204 void 
00205 lt_XMLParser::Impl::save(void)
00206 {
00207   GCriticalSectionLock lock(&xmlparser_lock);
00208   for(GPosition pos=m_docs;pos;++pos)
00209   {
00210     const GP<DjVuDocument> doc(m_docs[pos]);
00211     const GURL url=doc->get_init_url();
00212     
00213     DEBUG_MSG("Saving "<<(const char *)url<<" with new text and annotations\n");
00214     const bool bundle=doc->is_bundled()||(doc->get_doc_type()==DjVuDocument::SINGLE_PAGE);
00215     doc->save_as(url,bundle);
00216   }
00217   empty();
00218 }
00219 
00220 void
00221 lt_XMLParser::Impl::parse(const GP<ByteStream> &bs)
00222 {
00223   const GP<lt_XMLTags> tags(lt_XMLTags::create(bs));
00224   parse(*tags);
00225 }
00226   
00227 static const GMap<GUTF8String,GMapArea::BorderType> &
00228 BorderTypeMap(void)
00229 {
00230   static GMap<GUTF8String,GMapArea::BorderType> typeMap;
00231   if (! typeMap.size()) 
00232     {
00233       typeMap["none"]=GMapArea::NO_BORDER;
00234       typeMap["xor"]=GMapArea::XOR_BORDER;
00235       typeMap["solid"]=GMapArea::SOLID_BORDER;
00236       typeMap["default"]=GMapArea::SOLID_BORDER;
00237       typeMap["shadowout"]=GMapArea::SHADOW_OUT_BORDER;
00238       typeMap["shadowin"]=GMapArea::SHADOW_IN_BORDER;
00239       typeMap["etchedin"]=GMapArea::SHADOW_EIN_BORDER;
00240       typeMap["etchedout"]=GMapArea::SHADOW_EOUT_BORDER;
00241     }
00242   return typeMap;
00243 }
00244 
00245 static unsigned long
00246 convertToColor(const GUTF8String &s)
00247 {
00248   unsigned long retval=0;
00249   if(s.length())
00250   {
00251     int endpos;
00252     if(s[0] == '#')
00253     {
00254       retval=s.substr(1,-1).toULong(0,endpos,16);
00255     }
00256     if(endpos < 0)
00257     {
00258       G_THROW( (ERR_MSG("XMLAnno.bad_color") "\t")+s );
00259     }
00260   }
00261   return retval;
00262 }
00263 
00264 void
00265 lt_XMLParser::Impl::ChangeInfo(DjVuFile &dfile,const int dpi,const double gamma)
00266 {
00267   GP<DjVuInfo> info;
00268   if(dpi >= 5 && dpi <= 4800)
00269   {
00270     dfile.resume_decode(true);
00271     if(dfile.info && (dpi != dfile.info->dpi) )
00272     {
00273       info=new DjVuInfo(*dfile.info);
00274       info->dpi=dpi;
00275     }
00276   }
00277   if(gamma >= 0.1 && gamma <= 5.0)
00278   {
00279     dfile.resume_decode(true);
00280     if(dfile.info && (gamma != dfile.info->gamma) )
00281     {
00282       if(!info)
00283         info=new DjVuInfo(*dfile.info);
00284       info->gamma=gamma;
00285     }
00286   }
00287   if(info)
00288   {
00289     dfile.change_info(info);
00290   }
00291 }
00292 
// Rebuilds the hyperlink map areas of dfile from the AREA tags of map.
//
//   width, height  size the coordinates in map refer to; when non-zero and
//                  different from the page size found in dfile.info, the
//                  coordinates are scaled by (page size)/(given size).
//   dfile          file whose annotations are rewritten.
//   map            tag whose AREA children describe the areas.
//
// Existing annotations are decoded first and their map areas discarded
// (when the file has info).  Y coordinates are flipped with y'=(h-1)-y.
// Throws XMLAnno.bad_rect / bad_circle / bad_oval when fewer than four
// coordinate values are available, XMLAnno.unknown_shape and
// XMLAnno.unknown_border for unrecognized attribute values.
// The file is marked modified and dfile.anno is re-encoded in every case,
// even when map has no AREA tag or the file has no info.
00293 void
00294 lt_XMLParser::Impl::ChangeAnno(
00295   const int width, const int height,
00296   DjVuFile &dfile, 
00297   const lt_XMLTags &map )
00298 {
00299   dfile.resume_decode(true);
00300   const GP<DjVuInfo> info(dfile.info);
00301   const GP<DjVuAnno> ganno(DjVuAnno::create());
00302   DjVuAnno &anno=*ganno;
00303   GPosition map_pos;
00304   map_pos=map.contains(areatag);
00305   if(dfile.contains_anno())
00306   {
00307     GP<ByteStream> annobs=dfile.get_merged_anno();
00308     if(annobs)
00309     {
00310       anno.decode(annobs);
00311       if(anno.ant && info)
00312       {
00313         anno.ant->map_areas.empty();
00314       }
00315     }
00316 //    dfile.remove_anno();
00317   }
00318   if(info && map_pos)
00319   {
00320     const int h=info->height;
00321     const int w=info->width;
          // Horizontal and vertical scale factors from the given size to
          // the page size; 1.0 when no size was given or the sizes agree.
00322     double ws=1.0;
00323     double hs=1.0;
00324     if(width && width != w)
00325     {
00326       ws=((double)w)/((double)width); 
00327     }
00328     if(height && height != h)
00329     {
00330       hs=((double)h)/((double)height); 
00331     }
00332     if(!anno.ant)
00333     {
00334       anno.ant=DjVuANT::create();
00335     }
00336     GPList<GMapArea> &map_areas=anno.ant->map_areas;
00337     map_areas.empty();
00338     GPList<lt_XMLTags> gareas=map[map_pos];
00339     for(GPosition pos=gareas;pos;++pos)
00340     {
00341       if(gareas[pos])
00342       {
00343         lt_XMLTags &areas=*(gareas[pos]);
00344         GMap<GUTF8String,GUTF8String> args(areas.get_args());
00345         GList<int> coords;
00346         // ******************************************************
00347         // Parse the coords attribute:  first read the raw data into
00348         // a list, then scale the x, y data into another list.  For
00349         // circles, you also get a radius element with (looks like an x
00350         // with no matching y).
00351         // ******************************************************
00352         {
00353           GPosition coords_pos=args.contains("coords");
00354           if(coords_pos)
00355           {
00356             GList<int> raw_coords;
00357             intList(args[coords_pos],raw_coords);
00358             for(GPosition raw_pos=raw_coords;raw_pos;++raw_pos)
00359             {
00360               const int r=raw_coords[raw_pos];
00361               const int x=(int)(ws*(double)r+0.5);
00362               coords.append(x);
00363               int y=h-1;
                    // When the raw list ends on an unpaired value, that same
                    // value r is also used to compute the y entry.
00364               if(! ++raw_pos)
00365               {
00366                 y-=(int)(hs*(double)r+0.5);
00367               }else
00368               {
00369                 y-=(int)(hs*(double)raw_coords[raw_pos]+0.5);
00370               }
00371               coords.append(y);
00372 //            DjVuPrintMessage("Coords (%d,%d)\n",x,y);
00373             }
00374           }
00375         }
00376         GUTF8String shape;
00377         {
00378           GPosition shape_pos=args.contains("shape");
00379           if(shape_pos)
00380           {
00381             shape=args[shape_pos];
00382           }
00383         }
00384         GP<GMapArea> a;
00385         if(shape == "default")
00386         {
                // "default" covers the whole page.
00387           GRect rect(0,0,w,h);
00388           a=GMapRect::create(rect);
00389         }else if(!shape.length() || shape == "rect")
00390         {
                // A missing shape attribute is treated like "rect".
00391           int xx[4];
00392           int i=0;
00393           for(GPosition rect_pos=coords;(rect_pos)&&(i<4);++rect_pos,++i)
00394           {
00395             xx[i]=coords[rect_pos];
00396           }
00397           if(i!=4)
00398           {
00399             G_THROW( ERR_MSG("XMLAnno.bad_rect") );
00400           }
                // Order the two corners so that width and height are not negative.
00401           int xmin,xmax; 
00402           if(xx[0]>xx[2])
00403           {
00404             xmax=xx[0];
00405             xmin=xx[2];
00406           }else
00407           {
00408             xmin=xx[0];
00409             xmax=xx[2];
00410           }
00411           int ymin,ymax; 
00412           if(xx[1]>xx[3])
00413           {
00414             ymax=xx[1];
00415             ymin=xx[3];
00416           }else
00417           {
00418             ymin=xx[1];
00419             ymax=xx[3];
00420           }
00421           GRect rect(xmin,ymin,xmax-xmin,ymax-ymin);
00422           a=GMapRect::create(rect);
00423         }else if(shape == "circle")
00424         {
00425           int xx[4];
00426           int i=0;
00427           GPosition rect_pos=coords.lastpos();
00428           if(rect_pos)
00429           {
                  // Duplicate the last value, then read the first four values.
00430             coords.append(coords[rect_pos]);
00431             for(rect_pos=coords;(rect_pos)&&(i<4);++rect_pos)
00432             {
00433               xx[i++]=coords[rect_pos];
00434             }
00435           }
00436           if(i!=4)
00437           {
00438             G_THROW( ERR_MSG("XMLAnno.bad_circle") );
00439           }
                // xx[3] went through the (h-1)-y flip in the coords loop
                // above; (h-xx[3])-1 undoes that flip to get the vertical radius.
00440           int x=xx[0],y=xx[1],rx=xx[2],ry=(h-xx[3])-1;
00441           GRect rect(x-rx,y-ry,2*rx,2*ry);
00442           a=GMapOval::create(rect);
00443         }else if(shape == "oval")
00444         {
00445           int xx[4];
00446           int i=0;
00447           for(GPosition rect_pos=coords;(rect_pos)&&(i<4);++rect_pos,++i)
00448           {
00449             xx[i]=coords[rect_pos];
00450           }
00451           if(i!=4)
00452           {
00453             G_THROW( ERR_MSG("XMLAnno.bad_oval") );
00454           }
                // Same corner ordering as for "rect".
00455           int xmin,xmax; 
00456           if(xx[0]>xx[2])
00457           {
00458             xmax=xx[0];
00459             xmin=xx[2];
00460           }else
00461           {
00462             xmin=xx[0];
00463             xmax=xx[2];
00464           }
00465           int ymin,ymax; 
00466           if(xx[1]>xx[3])
00467           {
00468             ymax=xx[1];
00469             ymin=xx[3];
00470           }else
00471           {
00472             ymin=xx[1];
00473             ymax=xx[3];
00474           }
00475           GRect rect(xmin,ymin,xmax-xmin,ymax-ymin);
00476           a=GMapOval::create(rect);
00477         }else if(shape == "poly")
00478         {
00479           GP<GMapPoly> p=GMapPoly::create();
                // Consume the values two at a time; an unpaired trailing x is dropped.
00480           for(GPosition poly_pos=coords;poly_pos;++poly_pos)
00481           {
00482             int x=coords[poly_pos];
00483             if(! ++poly_pos)
00484               break;
00485             int y=coords[poly_pos];
00486             p->add_vertex(x,y);
00487           }
00488           p->close_poly();
00489           a=p;
00490         }else
00491         {
00492           G_THROW( ( ERR_MSG("XMLAnno.unknown_shape") "\t")+shape );
00493         }
00494         if(a)
00495         {
                // Copy the remaining AREA attributes onto the new map area.
                // Note: this pos shadows the loop variable of the enclosing for.
00496           GPosition pos;
00497           if((pos=args.contains("href")))
00498           {
00499             a->url=args[pos];
00500           }
00501           if((pos=args.contains("target")))
00502           {
00503             a->target=args[pos];
00504           }
00505           if((pos=args.contains("alt")))
00506           {
00507             a->comment=args[pos];
00508           }
00509           if((pos=args.contains("bordertype")))
00510           {
00511             GUTF8String b=args[pos];
                  // Function-local copy of the table returned by BorderTypeMap().
00512             static const GMap<GUTF8String,GMapArea::BorderType> typeMap=BorderTypeMap();
00513             if((pos=typeMap.contains(b)))
00514             {
00515               a->border_type=typeMap[pos];
00516             }else
00517             {
00518               G_THROW( (ERR_MSG("XMLAnno.unknown_border") "\t")+b );
00519             }
00520           }
                // Only the presence of the "visible" attribute matters, not its value.
00521           a->border_always_visible=!!args.contains("visible");
00522           if((pos=args.contains("bordercolor")))
00523           {
00524             a->border_color=convertToColor(args[pos]);
00525           }
00526           if((pos=args.contains("highlight")))
00527           {
00528             a->hilite_color=convertToColor(args[pos]);
00529           }
00530           if((pos=args.contains("border")))
00531           {
00532              a->border_width=args[pos].toInt(); //atoi(args[pos]);
00533           }
00534           map_areas.append(a);
00535         }
00536       }
00537     }
00538   }
        // Store the re-encoded annotations in the file and flag it as modified.
00539   dfile.set_modified(true);
00540   dfile.anno=ByteStream::create();
00541   anno.encode(dfile.anno);
00542 }
00543 
00544 GP<DjVuFile>
00545 lt_XMLParser::Impl::get_file(const GURL &url,GUTF8String id)
00546 {
00547   GP<DjVuFile> dfile;
00548   GP<DjVuDocument> doc;
00549   GCriticalSectionLock lock(&xmlparser_lock);
00550   {
00551     GPosition pos=m_docs.contains(url.get_string());
00552     if(pos)
00553     {
00554       doc=m_docs[pos];
00555     }else
00556     {
00557       doc=DjVuDocument::create_wait(url);
00558       if(! doc->wait_for_complete_init())
00559       {
00560         G_THROW(( ERR_MSG("XMLAnno.fail_init") "\t")+url.get_string() );
00561       }
00562       m_docs[url.get_string()]=doc;
00563     }
00564     if(id.is_int())
00565     {
00566       const int xpage=id.toInt(); //atoi((char const *)page); 
00567       if(xpage>0)
00568         id=doc->page_to_id(xpage-1);
00569     }else if(!id.length())
00570     { 
00571       id=doc->page_to_id(0);
00572     }
00573   }
00574   const GURL fileurl(doc->id_to_url(id));
00575   GPosition dpos(m_files.contains(fileurl.get_string()));
00576   if(!dpos)
00577   {
00578     if(!doc->get_id_list().contains(id))
00579     {
00580       G_THROW( ERR_MSG("XMLAnno.bad_page") );
00581     }
00582     dfile=doc->get_djvu_file(id,false);
00583     if(!dfile)
00584     {
00585       G_THROW( ERR_MSG("XMLAnno.bad_page") );
00586     }
00587     m_files[fileurl.get_string()]=dfile;
00588   }else
00589   {
00590     dfile=m_files[dpos];
00591   }
00592   return dfile;
00593 }
00594   
// Processes a parsed XML document.
//
// The document must contain exactly one BODY tag: XMLAnno.extra_body is
// thrown when there is none or more than one, XMLAnno.no_body when the
// entry is null.  MAP tags are collected by their "name" attribute, then
// every OBJECT tag of the body that has a "data" attribute (and, when a
// "type" attribute is present, the DjVu MIME type) is applied to the file
// it designates: info (dpi/gamma), annotations, metadata, hidden text and
// optional OCR, in that order.
00595 void
00596 lt_XMLParser::Impl::parse(const lt_XMLTags &tags)
00597 {
00598   const GPList<lt_XMLTags> Body(tags.get_Tags(bodytag));
00599   GPosition pos=Body;
00600  
        // pos is the first BODY entry; it must also be the last one.
00601   if(!pos || (pos != Body.lastpos()))
00602   {
00603     G_THROW( ERR_MSG("XMLAnno.extra_body") );
00604   }
00605   const GP<lt_XMLTags> GBody(Body[pos]);
00606   if(!GBody)
00607   {
00608     G_THROW( ERR_MSG("XMLAnno.no_body") );
00609   }
00610 
        // Maps is filled from both the BODY list and the OBJECT list.
00611   GMap<GUTF8String,GP<lt_XMLTags> > Maps;
00612   lt_XMLTags::get_Maps(maptag,"name",Body,Maps);
00613 
00614   const GPList<lt_XMLTags> Objects(GBody->get_Tags(objecttag));
00615   lt_XMLTags::get_Maps(maptag,"name",Objects,Maps);
00616 
00617   for(GPosition Objpos=Objects;Objpos;++Objpos)
00618   {
00619     lt_XMLTags &GObject=*Objects[Objpos];
00620     // Map of attributes to value (e.g. "width" --> "500")
00621     const GMap<GUTF8String,GUTF8String> &args=GObject.get_args();
00622     GURL codebase;
00623     {
00624       DEBUG_MSG("Setting up codebase... m_codebase = " << m_codebase << "\n");
00625       GPosition codebasePos=args.contains("codebase");
00626       // If user specified a codebase attribute, assume it is correct (absolute URL):
00627       //  the GURL constructor will throw an exception if it isn't
00628       if(codebasePos)
00629       {
00630         codebase=GURL::UTF8(args[codebasePos]);
00631       }else if (m_codebase.is_dir())
00632       {
00633         codebase=m_codebase;
00634       }else
00635       {
              // Last resort: the current working directory.
00636         codebase=GURL::Filename::UTF8(GOS::cwd());
00637       }
00638       DEBUG_MSG("codebase = " << codebase << "\n");
00639     }
00640     // the data attribute specifies the input file.  This can be
00641     //  either an absolute URL (starts with file:/) or a relative
00642     //  URL (for now, just a path and file name).  If it's absolute,
00643     //  our GURL will adequately wrap it.  If it's relative, we need
00644     //  to use the codebase attribute to form an absolute URL first.
00645     GPosition datapos=args.contains("data");
00646     if(datapos)
00647     {
            // NOTE(review): isDjVuType is assigned below but never read in
            // this function.
00648       bool isDjVuType=false;
00649       GPosition typePos(args.contains("type"));
00650       if(typePos)
00651       {
00652         if(args[typePos] != mimetype)
00653         {
00654 //          DjVuPrintErrorUTF8("Ignoring %s Object tag\n",mimetype);
00655           continue;
00656         }
00657         isDjVuType=true;
00658       }
            // A data value starting with '/' is resolved against
            // codebase.base() instead of codebase itself.
00659       const GURL url=GURL::UTF8(args[datapos],(args[datapos][0] == '/')?codebase.base():codebase);
            // Missing width/height attributes are passed on as 0.
00660       int width;
00661       {
00662         GPosition widthPos=args.contains("width");
00663         width=(widthPos)?args[widthPos].toInt():0;
00664       }
00665       int height;
00666       {
00667         GPosition heightPos=args.contains("height");
00668         height=(heightPos)?args[heightPos].toInt():0;
00669       }
            // Settings collected from the PARAM children.  Each may be given
            // as its own PARAM (name/value) or inside a "flags" PARAM whose
            // value is split with lt_XMLTags::ParseValues().  PARAM names are
            // compared in lower case.
00670       GUTF8String gamma;
00671       GUTF8String dpi;
00672       GUTF8String page;
00673       GUTF8String do_ocr;
00674       {
00675         GPosition paramPos(GObject.contains(paramtag));
00676         if(paramPos)
00677         {
00678           const GPList<lt_XMLTags> Params(GObject[paramPos]);
00679           for(GPosition loc=Params;loc;++loc)
00680           {
00681             const GMap<GUTF8String,GUTF8String> &pargs=Params[loc]->get_args();
00682             GPosition namepos=pargs.contains("name");
00683             if(namepos)
00684             {
00685               GPosition valuepos=pargs.contains("value");
00686               if(valuepos)
00687               {
00688                 const GUTF8String name=pargs[namepos].downcase();
00689                 const GUTF8String &value=pargs[valuepos];
00690                 if(name == "flags")
00691                 {
                        // This local args shadows the OBJECT attribute map above.
00692                   GMap<GUTF8String,GUTF8String> args;
00693                   lt_XMLTags::ParseValues(value,args,true);
00694                   if(args.contains("page"))
00695                   {
00696                     page=args["page"];
00697                   }
00698                   if(args.contains("dpi"))
00699                   {
00700                     dpi=args["dpi"];
00701                   }
00702                   if(args.contains("gamma"))
00703                   {
00704                     gamma=args["gamma"];
00705                   }
00706                   if(args.contains("ocr"))
00707                   {
00708                     do_ocr=args["ocr"];
00709                   }
00710                 }else if(name == "page")
00711                 {
00712                   page=value;
00713                 }else if(name == "dpi")
00714                 {
00715                   dpi=value;
00716                 }else if(name == "gamma")
00717                 {
00718                   gamma=value;
00719                 }else if(name == "ocr")
00720                 {
00721                   do_ocr=value;
00722                 }
00723               }
00724             }
00725           }
00726         }
00727       }
00728       const GP<DjVuFile> dfile(get_file(url,page));
            // ChangeInfo() itself ignores values outside its accepted ranges.
00729       if(dpi.is_int() || gamma.is_float())
00730       {
00731         int pos=0;
00732         ChangeInfo(*dfile,dpi.toInt(),gamma.toDouble(pos,pos));
00733       }
00734       parse_anno(width,height,GObject,Maps,*dfile);
00735       parse_meta(GObject,*dfile);
00736       parse_text(width,height,GObject,*dfile);
00737       ChangeTextOCR(do_ocr,width,height,dfile);
00738     }
00739   }
00740 }
00741 
00742 void
00743 lt_XMLParser::Impl::parse_anno(
00744   const int width,
00745   const int height,
00746   const lt_XMLTags &GObject,
00747   GMap<GUTF8String,GP<lt_XMLTags> > &Maps,
00748   DjVuFile &dfile )
00749 {
00750   GP<lt_XMLTags> map;
00751   {
00752     GPosition usemappos=GObject.get_args().contains("usemap");
00753     if(usemappos)
00754     {
00755       const GUTF8String mapname(GObject.get_args()[usemappos]);
00756       GPosition mappos=Maps.contains(mapname);
00757       if(!mappos)
00758       {
00759         G_THROW((ERR_MSG("XMLAnno.map_find") "\t")+mapname );
00760       }else
00761       {
00762         map=Maps[mappos];
00763       }
00764     }
00765   }
00766   if(map)
00767   {
00768     ChangeAnno(width,height,dfile,*map);
00769   }
00770 }
00771 
// File-local max/min helpers.  Any macro of the same name is removed
// first so that the templates below can be declared.

#ifdef max
#undef max
#endif
// Returns a when a compares greater than b, otherwise b.
template<class TYPE>
static inline TYPE max(TYPE a,TYPE b)
{
  if(a>b)
    return a;
  return b;
}

#ifdef min
#undef min
#endif
// Returns a when a compares less than b, otherwise b.
template<class TYPE>
static inline TYPE min(TYPE a,TYPE b)
{
  if(a<b)
    return a;
  return b;
}
00782 
00783 // used to build the zone tree
00784 // true is returned if the GRect is known for this object,
00785 // and false, if the rectangle's size is just the parent size.
//
//   parent  zone the new zone is appended to; for an unrecognized tag name
//           the parent itself is used and marked as the PAGE zone.
//   tag     XML tag describing the zone (WORD, LINE, PARAGRAPH, REGION,
//           PAGECOLUMN or anything else for the page).
//   bs      stream receiving the text; zone offsets are taken from bs.tell().
//   height  height used to flip y coordinates: y'=(height-1)-hs*y.
//   ws, hs  horizontal and vertical scale factors applied to the coords.
//
// The function recurses over the content of tag and grows parent.rect so
// that it includes the rectangle of the zone just built.
00786 static bool
00787 make_child_layer(
00788   DjVuTXT::Zone &parent,
00789   const lt_XMLTags &tag, ByteStream &bs,
00790   const int height, const double ws, const double hs)
00791 {
00792   bool retval=true;
00793   // the plugin thinks there are only Pages, Lines and Words
00794   // so we don't make Paragraphs, Regions and Columns zones
00795   // if we did the plugin is not able to search the text but 
00796   // DjVuToText writes out all the text anyway
00797   DjVuTXT::Zone *self_ptr;
        // Separator written to bs after the text of the zone (0 = none).
00798   char sepchar;
00799   const GUTF8String name(tag.get_name());
00800   if(name == wordtag)
00801   {
00802     self_ptr=parent.append_child();
00803     self_ptr->ztype = DjVuTXT::WORD;
00804     sepchar=' ';
00805   }else if(name == linetag)
00806   {
00807     self_ptr=parent.append_child();
00808     self_ptr->ztype = DjVuTXT::LINE;
00809     sepchar=DjVuTXT::end_of_line;
00810   }else if(name == paragraphtag)
00811   {
00812     self_ptr=parent.append_child();
00813     self_ptr->ztype = DjVuTXT::PARAGRAPH;
00814     sepchar=DjVuTXT::end_of_paragraph;
00815   }else if(name == regiontag)
00816   {
00817     self_ptr=parent.append_child();
00818     self_ptr->ztype = DjVuTXT::REGION;
00819     sepchar=DjVuTXT::end_of_region;
00820   }else if(name == pagecolumntag)
00821   {
00822     self_ptr=parent.append_child();
00823     self_ptr->ztype = DjVuTXT::COLUMN;
00824     sepchar=DjVuTXT::end_of_column;
00825   }else
00826   {
          // Any other tag: no child is created, the parent zone is the page.
00827     self_ptr = &parent;
00828     self_ptr->ztype = DjVuTXT::PAGE;
00829     sepchar=0;
00830   }
00831   DjVuTXT::Zone &self = *self_ptr;
00832   self.text_start = bs.tell();
        // References into self.rect: assigning to them edits the zone rectangle.
00833   int &xmin=self.rect.xmin, &ymin=self.rect.ymin, 
00834     &xmax=self.rect.xmax, &ymax=self.rect.ymax;
        // default_rect is the parent's rectangle with min and max swapped
        // (xmin holds the larger x, xmax the smaller one, same for y).
00835   GRect default_rect;
00836   default_rect.xmin=max(parent.rect.xmax,parent.rect.xmin);
00837   default_rect.xmax=min(parent.rect.xmax,parent.rect.xmin);
00838   default_rect.ymin=max(parent.rect.ymax,parent.rect.ymin);
00839   default_rect.ymax=min(parent.rect.ymax,parent.rect.ymin);
00840   // Now if there are coordinates, use those.
00841   GPosition pos(tag.get_args().contains("coords"));
00842   if(pos)
00843   {
00844     GList<int> rectArgs;
00845     intList(tag.get_args()[pos], rectArgs);
          // pos now walks rectArgs; it is left non-null only when all four
          // values (xmin, ymin, xmax, ymax) could be read.
00846     if((pos=rectArgs))
00847     {
00848       xmin=(int)(ws*(double)rectArgs[pos]);
00849       if(++pos)
00850       {
00851         ymin=(height-1)-(int)(hs*(double)rectArgs[pos]);
00852         if(++pos)
00853         {
00854           xmax=(int)(ws*(double)rectArgs[pos]);
00855           if(++pos)
00856           {
00857             ymax=(height-1)-(int)(hs*(double)rectArgs[pos]);
00858             if(xmin>xmax) // Make sure xmin is really minimum
00859             {
00860               const int t=xmin;
00861               xmin=xmax;
00862               xmax=t;
00863             }
00864             if(ymin>ymax) // Make sure ymin is really minimum
00865             {
00866               const int t=ymin;
00867               ymin=ymax;
00868               ymax=t;
00869             }
00870           }
00871         }
00872       }
00873     }
00874   }
00875   if(self.ztype == DjVuTXT::WORD)
00876   {
          // A word without a complete set of coordinates gets default_rect
          // and reports "rectangle unknown" to the caller.
00877     if(! pos)
00878     {
00879       self.rect=default_rect;
00880       retval=false;
00881     }
          // Write the unescaped raw text of the tag, trimmed of surrounding
          // white space, followed by the separator.
00882     const GUTF8String raw(tag.get_raw().fromEscaped());
00883     const int i=raw.nextNonSpace(0);
00884     bs.writestring(raw.substr(i,raw.firstEndSpace(i)-i));
00885     if(sepchar)
00886       bs.write8(sepchar);
00887     self.text_length = bs.tell() - self.text_start;
00888   }else if(pos)
00889   {
          // Non-word zone with its own coordinates: recurse into the content
          // if there is any, otherwise emit the raw text of the tag.
00890     pos=tag.get_content();
00891     if(pos)
00892     {
00893       for(pos=tag.get_content(); pos; ++pos)
00894       {
              // NOTE(review): t is dereferenced without a null check; this
              // presumes every content entry carries a tag -- confirm.
00895         const GP<lt_XMLTags> t(tag.get_content()[pos].tag);
00896         make_child_layer(self, *t, bs, height,ws,hs);
00897       }
00898       if(sepchar)
00899         bs.write8(sepchar);
00900       self.text_length = bs.tell() - self.text_start;
00901     }else
00902     {
00903       const GUTF8String raw(tag.get_raw().fromEscaped());
00904       const int i=raw.nextNonSpace(0);
00905       bs.writestring(raw.substr(i,raw.firstEndSpace(i)-i));
00906       if(sepchar)
00907         bs.write8(sepchar);
00908       self.text_length = bs.tell() - self.text_start;
00909     }
00910   }else
00911   {
          // Non-word zone without usable coordinates: derive the rectangle
          // from the children.
00912     self.rect=default_rect;
00913     if((pos=tag.get_content()))
00914     {
00915       do
00916       {
00917         const GP<lt_XMLTags> t(tag.get_content()[pos].tag);
00918         const GRect save_rect(self.rect);
00919         self.rect=default_rect;
              // The assignment inside the condition is intentional: retval
              // records whether this child had a known rectangle.
00920         if(retval=make_child_layer(self, *t, bs, height,ws,hs))
00921         {
                // Merge the rectangle accumulated so far with the one the
                // child just added to self.rect.
00922           xmin=min(save_rect.xmin,xmin);
00923           xmax=max(save_rect.xmax,xmax);
00924           ymin=min(save_rect.ymin,ymin);
00925           ymax=max(save_rect.ymax,ymax);
00926         }else
00927         {
00928           // If the child doesn't have coordinates, we need to use a box
00929           // at least as big as the parent's coordinates.
00930           xmin=min(save_rect.xmin,default_rect.xmax);
00931           xmax=max(save_rect.xmax,default_rect.xmin);
00932           ymin=min(save_rect.ymin,default_rect.ymax);
00933           ymax=max(save_rect.ymax,default_rect.ymin);
                // Process the content from the current position to the end,
                // without further rectangle merging in this loop.  The loop
                // starts at pos, i.e. at the child that was just processed.
00934           for(; pos; ++pos)
00935           {
00936             const GP<lt_XMLTags> t(tag.get_content()[pos].tag);
00937             make_child_layer(self, *t, bs, height,ws,hs);
00938           }
00939           break;
00940         }
00941       } while(++pos);
00942       if(sepchar)
00943         bs.write8(sepchar);
00944       self.text_length = bs.tell() - self.text_start;
00945     }else
00946     {
00947       const GUTF8String raw(tag.get_raw().fromEscaped());
00948       const int i=raw.nextNonSpace(0);
00949       bs.writestring(raw.substr(i,raw.firstEndSpace(i)-i));
00950       if(sepchar)
00951         bs.write8(sepchar);
00952       self.text_length = bs.tell() - self.text_start;
00953     }
00954   }
        // Grow the parent's rectangle to include this zone.
00955   parent.rect.xmin=min(xmin,parent.rect.xmin);
00956   parent.rect.ymin=min(ymin,parent.rect.ymin);
00957   parent.rect.xmax=max(xmax,parent.rect.xmax);
00958   parent.rect.ymax=max(ymax,parent.rect.ymax);
        // Finally make sure this zone's own rectangle has min <= max.
00959   if(xmin>xmax)
00960   {
00961     const int t=xmin;
00962     xmin=xmax;
00963     xmax=t;
00964   }
00965   if(ymin>ymax)
00966   {
00967     const int t=ymin;
00968     ymin=ymax;
00969     ymax=t;
00970   }
00971 //  DjVuPrintMessage("(%d,%d)(%d,%d)<<<\\%o>>>\n",
00972 //    xmin,ymin,xmax,ymax, sepchar);
00973   return retval;
00974 }
00975 
00976 void 
00977 lt_XMLParser::Impl::ChangeTextOCR(
00978   const GUTF8String &value,
00979   const int width,
00980   const int height,
00981   const GP<DjVuFile> &dfile)
00982 {
00983   if(value.length() && value.downcase() != "false")
00984   {
00985     const GP<ByteStream> bs=OCRcallback(value,DjVuImage::create(dfile));
00986     if( bs && bs->size() )
00987     {
00988       const GP<lt_XMLTags> tags(lt_XMLTags::create(bs));
00989       ChangeText(width,height,*dfile,*tags);
00990     }
00991   }
00992 }
00993 
00994 void 
00995 lt_XMLParser::Impl::ChangeMeta(
00996   DjVuFile &dfile, const lt_XMLTags &tags )
00997 {
00998   dfile.resume_decode(true);
00999   GP<ByteStream> gbs(ByteStream::create());
01000   tags.write(*gbs,false);
01001   gbs->seek(0L);
01002   GUTF8String raw(gbs->getAsUTF8());
01003   if(raw.length())
01004   {
01005      //GUTF8String gs="<"+(metadatatag+(">"+raw))+"</"+metadatatag+">\n");
01006     dfile.change_meta(raw+"\n");
01007   }else
01008   {
01009     dfile.change_meta(GUTF8String());
01010   }
01011 }
01012 
01013 void 
01014 lt_XMLParser::Impl::ChangeText(
01015   const int width, const int height,
01016   DjVuFile &dfile, const lt_XMLTags &tags )
01017 {
01018   dfile.resume_decode(true);
01019   
01020   GP<DjVuText> text = DjVuText::create();
01021   GP<DjVuTXT> txt = text->txt = DjVuTXT::create();
01022   
01023   // to store the new text
01024   GP<ByteStream> textbs = ByteStream::create(); 
01025   
01026   GP<DjVuInfo> info=(dfile.info);
01027   if(info)
01028   {
01029     const int h=info->height;
01030     const int w=info->width;
01031     txt->page_zone.text_start = 0;
01032     DjVuTXT::Zone &parent=txt->page_zone;
01033     parent.rect.xmin=0;
01034     parent.rect.ymin=0;
01035     parent.rect.ymax=h;
01036     parent.rect.xmax=w;
01037     double ws=1.0;
01038     if(width && width != w)
01039     {
01040       ws=((double)w)/((double)width);
01041     }
01042     double hs=1.0;
01043     if(height && height != h)
01044     {
01045       hs=((double)h)/((double)height);
01046     }
01047     make_child_layer(parent, tags, *textbs, h, ws,hs);
01048     textbs->write8(0);
01049     long len = textbs->tell();
01050     txt->page_zone.text_length = len;
01051     textbs->seek(0,SEEK_SET);
01052     textbs->read(txt->textUTF8.getbuf(len), len);
01053   
01054     dfile.change_text(txt,false);
01055   }
01056 }
01057 
01058 void
01059 lt_XMLParser::Impl::parse_text(
01060   const int width,
01061   const int height,
01062   const lt_XMLTags &GObject,
01063   DjVuFile &dfile )
01064 {
01065   GPosition textPos = GObject.contains(hiddentexttag);
01066   if(textPos)
01067   {
01068     // loop through the hidden text - there should only be one 
01069     // if there are more ??only the last one will be saved??
01070     GPList<lt_XMLTags> textTags = GObject[textPos];
01071     GPosition pos = textTags;
01072     ChangeText(width,height,dfile,*textTags[pos]);
01073   }
01074 }
01075 
01076 void
01077 lt_XMLParser::Impl::parse_meta(
01078   const lt_XMLTags &GObject,
01079   DjVuFile &dfile )
01080 {
01081   GPosition metaPos = GObject.contains(metadatatag);
01082   if(metaPos)
01083   {
01084     // loop through the hidden text - there should only be one 
01085     // if there are more ??only the last one will be saved??
01086     GPList<lt_XMLTags> metaTags = GObject[metaPos];
01087     GPosition pos = metaTags;
01088     ChangeMeta(dfile,*metaTags[pos]);
01089   }
01090 }
01091 
01092 static GP<ByteStream>
01093 OCRcallback(
01094   void * const xarg,
01095   lt_XMLParser::mapOCRcallback * const xcallback,
01096   const GUTF8String &value,
01097   const GP<DjVuImage> &image )
01098 {
01099   GP<ByteStream> retval;
01100   static void *arg=0;
01101   static lt_XMLParser::mapOCRcallback *callback=0;
01102   if(image)
01103   {
01104     if(callback)
01105       retval=callback(arg,value,image);
01106   }else
01107   {
01108     arg=xarg;
01109     callback=xcallback;
01110   }
01111   return retval;
01112 }
01113 
01114 void
01115 lt_XMLParser::setOCRcallback(
01116   void * const arg,
01117   mapOCRcallback * const callback)
01118 {
01119   ::OCRcallback(arg,callback);
01120 }
01121 
01122 
01123 #ifdef HAVE_NAMESPACES
01124 }
01125 # ifndef NOT_USING_DJVU_NAMESPACE
01126 using namespace DJVU;
01127 # endif
01128 #endif

kviewshell

Skip menu "kviewshell"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members

API Reference

Skip menu "API Reference"
  • kviewshell
Generated for API Reference by doxygen 1.5.9
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal