00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057 #ifdef HAVE_CONFIG_H
00058 # include "config.h"
00059 #endif
00060 #if NEED_GNUG_PRAGMAS
00061 # pragma implementation
00062 #endif
00063
00064
00065
00066
00067 #include "XMLParser.h"
00068 #include "XMLTags.h"
00069 #include "ByteStream.h"
00070 #include "GOS.h"
00071 #include "DjVuDocument.h"
00072 #include "DjVuText.h"
00073 #include "DjVuAnno.h"
00074 #include "DjVuFile.h"
00075 #include "DjVuImage.h"
00076 #include "debug.h"
00077 #include <stdio.h>
00078 #include <ctype.h>
00079 #include <stdlib.h>
00080
00081
00082 #ifdef HAVE_NAMESPACES
00083 namespace DJVU {
00084 # ifdef NOT_DEFINED // Just to fool emacs c++ mode
00085 }
00086 #endif
00087 #endif
00088
// MIME type an OBJECT's "type" attribute must equal (when present) for the
// OBJECT to be processed.
static const char mimetype[] = "image/x.djvu";

// Element names looked up while walking the parsed XML tree.
static const char bodytag[]   = "BODY";
static const char areatag[]   = "AREA";
static const char maptag[]    = "MAP";
static const char objecttag[] = "OBJECT";
static const char paramtag[]  = "PARAM";

// Element names that select the zone type of a hidden-text layer.
static const char wordtag[]       = "WORD";
static const char linetag[]       = "LINE";
static const char paragraphtag[]  = "PARAGRAPH";
static const char regiontag[]     = "REGION";
static const char pagecolumntag[] = "PAGECOLUMN";

// Child elements of OBJECT that carry hidden text and metadata.
static const char hiddentexttag[] = "HIDDENTEXT";
static const char metadatatag[]   = "METADATA";
00102
00103 class lt_XMLParser::Impl : public lt_XMLParser
00104 {
00105 public:
00106 Impl(void);
00107 virtual ~Impl();
00109 virtual void parse(const GP<ByteStream> &bs);
00111 virtual void parse(const lt_XMLTags &tags);
00113 virtual void save(void);
00115 virtual void empty(void);
00116 protected:
00117 GP<DjVuFile> get_file(const GURL &url,GUTF8String page);
00118
00119 void parse_anno(const int width, const int height,
00120 const lt_XMLTags &GObject,
00121 GMap<GUTF8String,GP<lt_XMLTags> > &Maps, DjVuFile &dfile);
00122
00123 void parse_text(const int width, const int height,
00124 const lt_XMLTags &GObject, DjVuFile &dfile);
00125
00126 void parse_meta(const lt_XMLTags &GObject, DjVuFile &dfile);
00127
00128 void ChangeAnno( const int width, const int height,
00129 DjVuFile &dfile, const lt_XMLTags &map);
00130
00131 void ChangeInfo(DjVuFile &dfile,const int dpi,const double gamma);
00132
00133 void ChangeText( const int width, const int height,
00134 DjVuFile &dfile, const lt_XMLTags &map);
00135
00136 void ChangeMeta( DjVuFile &dfile, const lt_XMLTags &map);
00137
00138 void ChangeTextOCR( const GUTF8String &value,
00139 const int width, const int height,
00140 const GP<DjVuFile> &dfile);
00141
00142
00143
00144
00145 GMap<GUTF8String,GP<DjVuFile> > m_files;
00146 GMap<GUTF8String,GP<DjVuDocument> > m_docs;
00147
00148 GURL m_codebase;
00149 GCriticalSection xmlparser_lock;
00150 };
00151
00152 static GP<ByteStream>
00153 OCRcallback(
00154 void * const xarg,
00155 lt_XMLParser::mapOCRcallback * const xcallback,
00156 const GUTF8String &value=GUTF8String(),
00157 const GP<DjVuImage> &image=0 );
00158
00159 static inline GP<ByteStream>
00160 OCRcallback(const GUTF8String &value, const GP<DjVuImage> &image)
00161 {
00162 return OCRcallback(0,0,value,image);
00163 }
00164
00165 lt_XMLParser::lt_XMLParser() {}
00166 lt_XMLParser::~lt_XMLParser() {}
00167 lt_XMLParser::Impl::Impl() {}
00168 lt_XMLParser::Impl::~Impl() {}
00169
00170 GP<lt_XMLParser>
00171 lt_XMLParser::create(void)
00172 {
00173 return new lt_XMLParser::Impl;
00174 }
00175
00176
00177 static void
00178 intList(GUTF8String coords, GList<int> &retval)
00179 {
00180 int pos=0;
00181 while(coords.length())
00182 {
00183 int epos;
00184 unsigned long i=coords.toLong(pos,epos,10);
00185 if(epos>=0)
00186 {
00187 retval.append(i);
00188 const int n=coords.nextNonSpace(epos);
00189 if(coords[n] != ',')
00190 break;
00191 pos=n+1;
00192 }
00193 }
00194 }
00195
00196 void
00197 lt_XMLParser::Impl::empty(void)
00198 {
00199 GCriticalSectionLock lock(&xmlparser_lock);
00200 m_files.empty();
00201 m_docs.empty();
00202 }
00203
00204 void
00205 lt_XMLParser::Impl::save(void)
00206 {
00207 GCriticalSectionLock lock(&xmlparser_lock);
00208 for(GPosition pos=m_docs;pos;++pos)
00209 {
00210 const GP<DjVuDocument> doc(m_docs[pos]);
00211 const GURL url=doc->get_init_url();
00212
00213 DEBUG_MSG("Saving "<<(const char *)url<<" with new text and annotations\n");
00214 const bool bundle=doc->is_bundled()||(doc->get_doc_type()==DjVuDocument::SINGLE_PAGE);
00215 doc->save_as(url,bundle);
00216 }
00217 empty();
00218 }
00219
00220 void
00221 lt_XMLParser::Impl::parse(const GP<ByteStream> &bs)
00222 {
00223 const GP<lt_XMLTags> tags(lt_XMLTags::create(bs));
00224 parse(*tags);
00225 }
00226
00227 static const GMap<GUTF8String,GMapArea::BorderType> &
00228 BorderTypeMap(void)
00229 {
00230 static GMap<GUTF8String,GMapArea::BorderType> typeMap;
00231 if (! typeMap.size())
00232 {
00233 typeMap["none"]=GMapArea::NO_BORDER;
00234 typeMap["xor"]=GMapArea::XOR_BORDER;
00235 typeMap["solid"]=GMapArea::SOLID_BORDER;
00236 typeMap["default"]=GMapArea::SOLID_BORDER;
00237 typeMap["shadowout"]=GMapArea::SHADOW_OUT_BORDER;
00238 typeMap["shadowin"]=GMapArea::SHADOW_IN_BORDER;
00239 typeMap["etchedin"]=GMapArea::SHADOW_EIN_BORDER;
00240 typeMap["etchedout"]=GMapArea::SHADOW_EOUT_BORDER;
00241 }
00242 return typeMap;
00243 }
00244
00245 static unsigned long
00246 convertToColor(const GUTF8String &s)
00247 {
00248 unsigned long retval=0;
00249 if(s.length())
00250 {
00251 int endpos;
00252 if(s[0] == '#')
00253 {
00254 retval=s.substr(1,-1).toULong(0,endpos,16);
00255 }
00256 if(endpos < 0)
00257 {
00258 G_THROW( (ERR_MSG("XMLAnno.bad_color") "\t")+s );
00259 }
00260 }
00261 return retval;
00262 }
00263
00264 void
00265 lt_XMLParser::Impl::ChangeInfo(DjVuFile &dfile,const int dpi,const double gamma)
00266 {
00267 GP<DjVuInfo> info;
00268 if(dpi >= 5 && dpi <= 4800)
00269 {
00270 dfile.resume_decode(true);
00271 if(dfile.info && (dpi != dfile.info->dpi) )
00272 {
00273 info=new DjVuInfo(*dfile.info);
00274 info->dpi=dpi;
00275 }
00276 }
00277 if(gamma >= 0.1 && gamma <= 5.0)
00278 {
00279 dfile.resume_decode(true);
00280 if(dfile.info && (gamma != dfile.info->gamma) )
00281 {
00282 if(!info)
00283 info=new DjVuInfo(*dfile.info);
00284 info->gamma=gamma;
00285 }
00286 }
00287 if(info)
00288 {
00289 dfile.change_info(info);
00290 }
00291 }
00292
00293 void
00294 lt_XMLParser::Impl::ChangeAnno(
00295 const int width, const int height,
00296 DjVuFile &dfile,
00297 const lt_XMLTags &map )
00298 {
00299 dfile.resume_decode(true);
00300 const GP<DjVuInfo> info(dfile.info);
00301 const GP<DjVuAnno> ganno(DjVuAnno::create());
00302 DjVuAnno &anno=*ganno;
00303 GPosition map_pos;
00304 map_pos=map.contains(areatag);
00305 if(dfile.contains_anno())
00306 {
00307 GP<ByteStream> annobs=dfile.get_merged_anno();
00308 if(annobs)
00309 {
00310 anno.decode(annobs);
00311 if(anno.ant && info)
00312 {
00313 anno.ant->map_areas.empty();
00314 }
00315 }
00316
00317 }
00318 if(info && map_pos)
00319 {
00320 const int h=info->height;
00321 const int w=info->width;
00322 double ws=1.0;
00323 double hs=1.0;
00324 if(width && width != w)
00325 {
00326 ws=((double)w)/((double)width);
00327 }
00328 if(height && height != h)
00329 {
00330 hs=((double)h)/((double)height);
00331 }
00332 if(!anno.ant)
00333 {
00334 anno.ant=DjVuANT::create();
00335 }
00336 GPList<GMapArea> &map_areas=anno.ant->map_areas;
00337 map_areas.empty();
00338 GPList<lt_XMLTags> gareas=map[map_pos];
00339 for(GPosition pos=gareas;pos;++pos)
00340 {
00341 if(gareas[pos])
00342 {
00343 lt_XMLTags &areas=*(gareas[pos]);
00344 GMap<GUTF8String,GUTF8String> args(areas.get_args());
00345 GList<int> coords;
00346
00347
00348
00349
00350
00351
00352 {
00353 GPosition coords_pos=args.contains("coords");
00354 if(coords_pos)
00355 {
00356 GList<int> raw_coords;
00357 intList(args[coords_pos],raw_coords);
00358 for(GPosition raw_pos=raw_coords;raw_pos;++raw_pos)
00359 {
00360 const int r=raw_coords[raw_pos];
00361 const int x=(int)(ws*(double)r+0.5);
00362 coords.append(x);
00363 int y=h-1;
00364 if(! ++raw_pos)
00365 {
00366 y-=(int)(hs*(double)r+0.5);
00367 }else
00368 {
00369 y-=(int)(hs*(double)raw_coords[raw_pos]+0.5);
00370 }
00371 coords.append(y);
00372
00373 }
00374 }
00375 }
00376 GUTF8String shape;
00377 {
00378 GPosition shape_pos=args.contains("shape");
00379 if(shape_pos)
00380 {
00381 shape=args[shape_pos];
00382 }
00383 }
00384 GP<GMapArea> a;
00385 if(shape == "default")
00386 {
00387 GRect rect(0,0,w,h);
00388 a=GMapRect::create(rect);
00389 }else if(!shape.length() || shape == "rect")
00390 {
00391 int xx[4];
00392 int i=0;
00393 for(GPosition rect_pos=coords;(rect_pos)&&(i<4);++rect_pos,++i)
00394 {
00395 xx[i]=coords[rect_pos];
00396 }
00397 if(i!=4)
00398 {
00399 G_THROW( ERR_MSG("XMLAnno.bad_rect") );
00400 }
00401 int xmin,xmax;
00402 if(xx[0]>xx[2])
00403 {
00404 xmax=xx[0];
00405 xmin=xx[2];
00406 }else
00407 {
00408 xmin=xx[0];
00409 xmax=xx[2];
00410 }
00411 int ymin,ymax;
00412 if(xx[1]>xx[3])
00413 {
00414 ymax=xx[1];
00415 ymin=xx[3];
00416 }else
00417 {
00418 ymin=xx[1];
00419 ymax=xx[3];
00420 }
00421 GRect rect(xmin,ymin,xmax-xmin,ymax-ymin);
00422 a=GMapRect::create(rect);
00423 }else if(shape == "circle")
00424 {
00425 int xx[4];
00426 int i=0;
00427 GPosition rect_pos=coords.lastpos();
00428 if(rect_pos)
00429 {
00430 coords.append(coords[rect_pos]);
00431 for(rect_pos=coords;(rect_pos)&&(i<4);++rect_pos)
00432 {
00433 xx[i++]=coords[rect_pos];
00434 }
00435 }
00436 if(i!=4)
00437 {
00438 G_THROW( ERR_MSG("XMLAnno.bad_circle") );
00439 }
00440 int x=xx[0],y=xx[1],rx=xx[2],ry=(h-xx[3])-1;
00441 GRect rect(x-rx,y-ry,2*rx,2*ry);
00442 a=GMapOval::create(rect);
00443 }else if(shape == "oval")
00444 {
00445 int xx[4];
00446 int i=0;
00447 for(GPosition rect_pos=coords;(rect_pos)&&(i<4);++rect_pos,++i)
00448 {
00449 xx[i]=coords[rect_pos];
00450 }
00451 if(i!=4)
00452 {
00453 G_THROW( ERR_MSG("XMLAnno.bad_oval") );
00454 }
00455 int xmin,xmax;
00456 if(xx[0]>xx[2])
00457 {
00458 xmax=xx[0];
00459 xmin=xx[2];
00460 }else
00461 {
00462 xmin=xx[0];
00463 xmax=xx[2];
00464 }
00465 int ymin,ymax;
00466 if(xx[1]>xx[3])
00467 {
00468 ymax=xx[1];
00469 ymin=xx[3];
00470 }else
00471 {
00472 ymin=xx[1];
00473 ymax=xx[3];
00474 }
00475 GRect rect(xmin,ymin,xmax-xmin,ymax-ymin);
00476 a=GMapOval::create(rect);
00477 }else if(shape == "poly")
00478 {
00479 GP<GMapPoly> p=GMapPoly::create();
00480 for(GPosition poly_pos=coords;poly_pos;++poly_pos)
00481 {
00482 int x=coords[poly_pos];
00483 if(! ++poly_pos)
00484 break;
00485 int y=coords[poly_pos];
00486 p->add_vertex(x,y);
00487 }
00488 p->close_poly();
00489 a=p;
00490 }else
00491 {
00492 G_THROW( ( ERR_MSG("XMLAnno.unknown_shape") "\t")+shape );
00493 }
00494 if(a)
00495 {
00496 GPosition pos;
00497 if((pos=args.contains("href")))
00498 {
00499 a->url=args[pos];
00500 }
00501 if((pos=args.contains("target")))
00502 {
00503 a->target=args[pos];
00504 }
00505 if((pos=args.contains("alt")))
00506 {
00507 a->comment=args[pos];
00508 }
00509 if((pos=args.contains("bordertype")))
00510 {
00511 GUTF8String b=args[pos];
00512 static const GMap<GUTF8String,GMapArea::BorderType> typeMap=BorderTypeMap();
00513 if((pos=typeMap.contains(b)))
00514 {
00515 a->border_type=typeMap[pos];
00516 }else
00517 {
00518 G_THROW( (ERR_MSG("XMLAnno.unknown_border") "\t")+b );
00519 }
00520 }
00521 a->border_always_visible=!!args.contains("visible");
00522 if((pos=args.contains("bordercolor")))
00523 {
00524 a->border_color=convertToColor(args[pos]);
00525 }
00526 if((pos=args.contains("highlight")))
00527 {
00528 a->hilite_color=convertToColor(args[pos]);
00529 }
00530 if((pos=args.contains("border")))
00531 {
00532 a->border_width=args[pos].toInt();
00533 }
00534 map_areas.append(a);
00535 }
00536 }
00537 }
00538 }
00539 dfile.set_modified(true);
00540 dfile.anno=ByteStream::create();
00541 anno.encode(dfile.anno);
00542 }
00543
00544 GP<DjVuFile>
00545 lt_XMLParser::Impl::get_file(const GURL &url,GUTF8String id)
00546 {
00547 GP<DjVuFile> dfile;
00548 GP<DjVuDocument> doc;
00549 GCriticalSectionLock lock(&xmlparser_lock);
00550 {
00551 GPosition pos=m_docs.contains(url.get_string());
00552 if(pos)
00553 {
00554 doc=m_docs[pos];
00555 }else
00556 {
00557 doc=DjVuDocument::create_wait(url);
00558 if(! doc->wait_for_complete_init())
00559 {
00560 G_THROW(( ERR_MSG("XMLAnno.fail_init") "\t")+url.get_string() );
00561 }
00562 m_docs[url.get_string()]=doc;
00563 }
00564 if(id.is_int())
00565 {
00566 const int xpage=id.toInt();
00567 if(xpage>0)
00568 id=doc->page_to_id(xpage-1);
00569 }else if(!id.length())
00570 {
00571 id=doc->page_to_id(0);
00572 }
00573 }
00574 const GURL fileurl(doc->id_to_url(id));
00575 GPosition dpos(m_files.contains(fileurl.get_string()));
00576 if(!dpos)
00577 {
00578 if(!doc->get_id_list().contains(id))
00579 {
00580 G_THROW( ERR_MSG("XMLAnno.bad_page") );
00581 }
00582 dfile=doc->get_djvu_file(id,false);
00583 if(!dfile)
00584 {
00585 G_THROW( ERR_MSG("XMLAnno.bad_page") );
00586 }
00587 m_files[fileurl.get_string()]=dfile;
00588 }else
00589 {
00590 dfile=m_files[dpos];
00591 }
00592 return dfile;
00593 }
00594
00595 void
00596 lt_XMLParser::Impl::parse(const lt_XMLTags &tags)
00597 {
00598 const GPList<lt_XMLTags> Body(tags.get_Tags(bodytag));
00599 GPosition pos=Body;
00600
00601 if(!pos || (pos != Body.lastpos()))
00602 {
00603 G_THROW( ERR_MSG("XMLAnno.extra_body") );
00604 }
00605 const GP<lt_XMLTags> GBody(Body[pos]);
00606 if(!GBody)
00607 {
00608 G_THROW( ERR_MSG("XMLAnno.no_body") );
00609 }
00610
00611 GMap<GUTF8String,GP<lt_XMLTags> > Maps;
00612 lt_XMLTags::get_Maps(maptag,"name",Body,Maps);
00613
00614 const GPList<lt_XMLTags> Objects(GBody->get_Tags(objecttag));
00615 lt_XMLTags::get_Maps(maptag,"name",Objects,Maps);
00616
00617 for(GPosition Objpos=Objects;Objpos;++Objpos)
00618 {
00619 lt_XMLTags &GObject=*Objects[Objpos];
00620
00621 const GMap<GUTF8String,GUTF8String> &args=GObject.get_args();
00622 GURL codebase;
00623 {
00624 DEBUG_MSG("Setting up codebase... m_codebase = " << m_codebase << "\n");
00625 GPosition codebasePos=args.contains("codebase");
00626
00627
00628 if(codebasePos)
00629 {
00630 codebase=GURL::UTF8(args[codebasePos]);
00631 }else if (m_codebase.is_dir())
00632 {
00633 codebase=m_codebase;
00634 }else
00635 {
00636 codebase=GURL::Filename::UTF8(GOS::cwd());
00637 }
00638 DEBUG_MSG("codebase = " << codebase << "\n");
00639 }
00640
00641
00642
00643
00644
00645 GPosition datapos=args.contains("data");
00646 if(datapos)
00647 {
00648 bool isDjVuType=false;
00649 GPosition typePos(args.contains("type"));
00650 if(typePos)
00651 {
00652 if(args[typePos] != mimetype)
00653 {
00654
00655 continue;
00656 }
00657 isDjVuType=true;
00658 }
00659 const GURL url=GURL::UTF8(args[datapos],(args[datapos][0] == '/')?codebase.base():codebase);
00660 int width;
00661 {
00662 GPosition widthPos=args.contains("width");
00663 width=(widthPos)?args[widthPos].toInt():0;
00664 }
00665 int height;
00666 {
00667 GPosition heightPos=args.contains("height");
00668 height=(heightPos)?args[heightPos].toInt():0;
00669 }
00670 GUTF8String gamma;
00671 GUTF8String dpi;
00672 GUTF8String page;
00673 GUTF8String do_ocr;
00674 {
00675 GPosition paramPos(GObject.contains(paramtag));
00676 if(paramPos)
00677 {
00678 const GPList<lt_XMLTags> Params(GObject[paramPos]);
00679 for(GPosition loc=Params;loc;++loc)
00680 {
00681 const GMap<GUTF8String,GUTF8String> &pargs=Params[loc]->get_args();
00682 GPosition namepos=pargs.contains("name");
00683 if(namepos)
00684 {
00685 GPosition valuepos=pargs.contains("value");
00686 if(valuepos)
00687 {
00688 const GUTF8String name=pargs[namepos].downcase();
00689 const GUTF8String &value=pargs[valuepos];
00690 if(name == "flags")
00691 {
00692 GMap<GUTF8String,GUTF8String> args;
00693 lt_XMLTags::ParseValues(value,args,true);
00694 if(args.contains("page"))
00695 {
00696 page=args["page"];
00697 }
00698 if(args.contains("dpi"))
00699 {
00700 dpi=args["dpi"];
00701 }
00702 if(args.contains("gamma"))
00703 {
00704 gamma=args["gamma"];
00705 }
00706 if(args.contains("ocr"))
00707 {
00708 do_ocr=args["ocr"];
00709 }
00710 }else if(name == "page")
00711 {
00712 page=value;
00713 }else if(name == "dpi")
00714 {
00715 dpi=value;
00716 }else if(name == "gamma")
00717 {
00718 gamma=value;
00719 }else if(name == "ocr")
00720 {
00721 do_ocr=value;
00722 }
00723 }
00724 }
00725 }
00726 }
00727 }
00728 const GP<DjVuFile> dfile(get_file(url,page));
00729 if(dpi.is_int() || gamma.is_float())
00730 {
00731 int pos=0;
00732 ChangeInfo(*dfile,dpi.toInt(),gamma.toDouble(pos,pos));
00733 }
00734 parse_anno(width,height,GObject,Maps,*dfile);
00735 parse_meta(GObject,*dfile);
00736 parse_text(width,height,GObject,*dfile);
00737 ChangeTextOCR(do_ocr,width,height,dfile);
00738 }
00739 }
00740 }
00741
00742 void
00743 lt_XMLParser::Impl::parse_anno(
00744 const int width,
00745 const int height,
00746 const lt_XMLTags &GObject,
00747 GMap<GUTF8String,GP<lt_XMLTags> > &Maps,
00748 DjVuFile &dfile )
00749 {
00750 GP<lt_XMLTags> map;
00751 {
00752 GPosition usemappos=GObject.get_args().contains("usemap");
00753 if(usemappos)
00754 {
00755 const GUTF8String mapname(GObject.get_args()[usemappos]);
00756 GPosition mappos=Maps.contains(mapname);
00757 if(!mappos)
00758 {
00759 G_THROW((ERR_MSG("XMLAnno.map_find") "\t")+mapname );
00760 }else
00761 {
00762 map=Maps[mappos];
00763 }
00764 }
00765 }
00766 if(map)
00767 {
00768 ChangeAnno(width,height,dfile,*map);
00769 }
00770 }
00771
// Remove any macro called max so that the template below can be declared.
#ifdef max
#undef max
#endif
// Returns a when a>b, otherwise b.
template<class TYPE>
static inline TYPE max(TYPE a,TYPE b)
{
  if(a>b)
  {
    return a;
  }
  return b;
}
// Remove any macro called min so that the template below can be declared.
#ifdef min
#undef min
#endif
// Returns a when a<b, otherwise b.
template<class TYPE>
static inline TYPE min(TYPE a,TYPE b)
{
  if(a<b)
  {
    return a;
  }
  return b;
}
00782
00783
00784
00785
00786 static bool
00787 make_child_layer(
00788 DjVuTXT::Zone &parent,
00789 const lt_XMLTags &tag, ByteStream &bs,
00790 const int height, const double ws, const double hs)
00791 {
00792 bool retval=true;
00793
00794
00795
00796
00797 DjVuTXT::Zone *self_ptr;
00798 char sepchar;
00799 const GUTF8String name(tag.get_name());
00800 if(name == wordtag)
00801 {
00802 self_ptr=parent.append_child();
00803 self_ptr->ztype = DjVuTXT::WORD;
00804 sepchar=' ';
00805 }else if(name == linetag)
00806 {
00807 self_ptr=parent.append_child();
00808 self_ptr->ztype = DjVuTXT::LINE;
00809 sepchar=DjVuTXT::end_of_line;
00810 }else if(name == paragraphtag)
00811 {
00812 self_ptr=parent.append_child();
00813 self_ptr->ztype = DjVuTXT::PARAGRAPH;
00814 sepchar=DjVuTXT::end_of_paragraph;
00815 }else if(name == regiontag)
00816 {
00817 self_ptr=parent.append_child();
00818 self_ptr->ztype = DjVuTXT::REGION;
00819 sepchar=DjVuTXT::end_of_region;
00820 }else if(name == pagecolumntag)
00821 {
00822 self_ptr=parent.append_child();
00823 self_ptr->ztype = DjVuTXT::COLUMN;
00824 sepchar=DjVuTXT::end_of_column;
00825 }else
00826 {
00827 self_ptr = &parent;
00828 self_ptr->ztype = DjVuTXT::PAGE;
00829 sepchar=0;
00830 }
00831 DjVuTXT::Zone &self = *self_ptr;
00832 self.text_start = bs.tell();
00833 int &xmin=self.rect.xmin, &ymin=self.rect.ymin,
00834 &xmax=self.rect.xmax, &ymax=self.rect.ymax;
00835 GRect default_rect;
00836 default_rect.xmin=max(parent.rect.xmax,parent.rect.xmin);
00837 default_rect.xmax=min(parent.rect.xmax,parent.rect.xmin);
00838 default_rect.ymin=max(parent.rect.ymax,parent.rect.ymin);
00839 default_rect.ymax=min(parent.rect.ymax,parent.rect.ymin);
00840
00841 GPosition pos(tag.get_args().contains("coords"));
00842 if(pos)
00843 {
00844 GList<int> rectArgs;
00845 intList(tag.get_args()[pos], rectArgs);
00846 if((pos=rectArgs))
00847 {
00848 xmin=(int)(ws*(double)rectArgs[pos]);
00849 if(++pos)
00850 {
00851 ymin=(height-1)-(int)(hs*(double)rectArgs[pos]);
00852 if(++pos)
00853 {
00854 xmax=(int)(ws*(double)rectArgs[pos]);
00855 if(++pos)
00856 {
00857 ymax=(height-1)-(int)(hs*(double)rectArgs[pos]);
00858 if(xmin>xmax)
00859 {
00860 const int t=xmin;
00861 xmin=xmax;
00862 xmax=t;
00863 }
00864 if(ymin>ymax)
00865 {
00866 const int t=ymin;
00867 ymin=ymax;
00868 ymax=t;
00869 }
00870 }
00871 }
00872 }
00873 }
00874 }
00875 if(self.ztype == DjVuTXT::WORD)
00876 {
00877 if(! pos)
00878 {
00879 self.rect=default_rect;
00880 retval=false;
00881 }
00882 const GUTF8String raw(tag.get_raw().fromEscaped());
00883 const int i=raw.nextNonSpace(0);
00884 bs.writestring(raw.substr(i,raw.firstEndSpace(i)-i));
00885 if(sepchar)
00886 bs.write8(sepchar);
00887 self.text_length = bs.tell() - self.text_start;
00888 }else if(pos)
00889 {
00890 pos=tag.get_content();
00891 if(pos)
00892 {
00893 for(pos=tag.get_content(); pos; ++pos)
00894 {
00895 const GP<lt_XMLTags> t(tag.get_content()[pos].tag);
00896 make_child_layer(self, *t, bs, height,ws,hs);
00897 }
00898 if(sepchar)
00899 bs.write8(sepchar);
00900 self.text_length = bs.tell() - self.text_start;
00901 }else
00902 {
00903 const GUTF8String raw(tag.get_raw().fromEscaped());
00904 const int i=raw.nextNonSpace(0);
00905 bs.writestring(raw.substr(i,raw.firstEndSpace(i)-i));
00906 if(sepchar)
00907 bs.write8(sepchar);
00908 self.text_length = bs.tell() - self.text_start;
00909 }
00910 }else
00911 {
00912 self.rect=default_rect;
00913 if((pos=tag.get_content()))
00914 {
00915 do
00916 {
00917 const GP<lt_XMLTags> t(tag.get_content()[pos].tag);
00918 const GRect save_rect(self.rect);
00919 self.rect=default_rect;
00920 if(retval=make_child_layer(self, *t, bs, height,ws,hs))
00921 {
00922 xmin=min(save_rect.xmin,xmin);
00923 xmax=max(save_rect.xmax,xmax);
00924 ymin=min(save_rect.ymin,ymin);
00925 ymax=max(save_rect.ymax,ymax);
00926 }else
00927 {
00928
00929
00930 xmin=min(save_rect.xmin,default_rect.xmax);
00931 xmax=max(save_rect.xmax,default_rect.xmin);
00932 ymin=min(save_rect.ymin,default_rect.ymax);
00933 ymax=max(save_rect.ymax,default_rect.ymin);
00934 for(; pos; ++pos)
00935 {
00936 const GP<lt_XMLTags> t(tag.get_content()[pos].tag);
00937 make_child_layer(self, *t, bs, height,ws,hs);
00938 }
00939 break;
00940 }
00941 } while(++pos);
00942 if(sepchar)
00943 bs.write8(sepchar);
00944 self.text_length = bs.tell() - self.text_start;
00945 }else
00946 {
00947 const GUTF8String raw(tag.get_raw().fromEscaped());
00948 const int i=raw.nextNonSpace(0);
00949 bs.writestring(raw.substr(i,raw.firstEndSpace(i)-i));
00950 if(sepchar)
00951 bs.write8(sepchar);
00952 self.text_length = bs.tell() - self.text_start;
00953 }
00954 }
00955 parent.rect.xmin=min(xmin,parent.rect.xmin);
00956 parent.rect.ymin=min(ymin,parent.rect.ymin);
00957 parent.rect.xmax=max(xmax,parent.rect.xmax);
00958 parent.rect.ymax=max(ymax,parent.rect.ymax);
00959 if(xmin>xmax)
00960 {
00961 const int t=xmin;
00962 xmin=xmax;
00963 xmax=t;
00964 }
00965 if(ymin>ymax)
00966 {
00967 const int t=ymin;
00968 ymin=ymax;
00969 ymax=t;
00970 }
00971
00972
00973 return retval;
00974 }
00975
00976 void
00977 lt_XMLParser::Impl::ChangeTextOCR(
00978 const GUTF8String &value,
00979 const int width,
00980 const int height,
00981 const GP<DjVuFile> &dfile)
00982 {
00983 if(value.length() && value.downcase() != "false")
00984 {
00985 const GP<ByteStream> bs=OCRcallback(value,DjVuImage::create(dfile));
00986 if( bs && bs->size() )
00987 {
00988 const GP<lt_XMLTags> tags(lt_XMLTags::create(bs));
00989 ChangeText(width,height,*dfile,*tags);
00990 }
00991 }
00992 }
00993
00994 void
00995 lt_XMLParser::Impl::ChangeMeta(
00996 DjVuFile &dfile, const lt_XMLTags &tags )
00997 {
00998 dfile.resume_decode(true);
00999 GP<ByteStream> gbs(ByteStream::create());
01000 tags.write(*gbs,false);
01001 gbs->seek(0L);
01002 GUTF8String raw(gbs->getAsUTF8());
01003 if(raw.length())
01004 {
01005
01006 dfile.change_meta(raw+"\n");
01007 }else
01008 {
01009 dfile.change_meta(GUTF8String());
01010 }
01011 }
01012
01013 void
01014 lt_XMLParser::Impl::ChangeText(
01015 const int width, const int height,
01016 DjVuFile &dfile, const lt_XMLTags &tags )
01017 {
01018 dfile.resume_decode(true);
01019
01020 GP<DjVuText> text = DjVuText::create();
01021 GP<DjVuTXT> txt = text->txt = DjVuTXT::create();
01022
01023
01024 GP<ByteStream> textbs = ByteStream::create();
01025
01026 GP<DjVuInfo> info=(dfile.info);
01027 if(info)
01028 {
01029 const int h=info->height;
01030 const int w=info->width;
01031 txt->page_zone.text_start = 0;
01032 DjVuTXT::Zone &parent=txt->page_zone;
01033 parent.rect.xmin=0;
01034 parent.rect.ymin=0;
01035 parent.rect.ymax=h;
01036 parent.rect.xmax=w;
01037 double ws=1.0;
01038 if(width && width != w)
01039 {
01040 ws=((double)w)/((double)width);
01041 }
01042 double hs=1.0;
01043 if(height && height != h)
01044 {
01045 hs=((double)h)/((double)height);
01046 }
01047 make_child_layer(parent, tags, *textbs, h, ws,hs);
01048 textbs->write8(0);
01049 long len = textbs->tell();
01050 txt->page_zone.text_length = len;
01051 textbs->seek(0,SEEK_SET);
01052 textbs->read(txt->textUTF8.getbuf(len), len);
01053
01054 dfile.change_text(txt,false);
01055 }
01056 }
01057
01058 void
01059 lt_XMLParser::Impl::parse_text(
01060 const int width,
01061 const int height,
01062 const lt_XMLTags &GObject,
01063 DjVuFile &dfile )
01064 {
01065 GPosition textPos = GObject.contains(hiddentexttag);
01066 if(textPos)
01067 {
01068
01069
01070 GPList<lt_XMLTags> textTags = GObject[textPos];
01071 GPosition pos = textTags;
01072 ChangeText(width,height,dfile,*textTags[pos]);
01073 }
01074 }
01075
01076 void
01077 lt_XMLParser::Impl::parse_meta(
01078 const lt_XMLTags &GObject,
01079 DjVuFile &dfile )
01080 {
01081 GPosition metaPos = GObject.contains(metadatatag);
01082 if(metaPos)
01083 {
01084
01085
01086 GPList<lt_XMLTags> metaTags = GObject[metaPos];
01087 GPosition pos = metaTags;
01088 ChangeMeta(dfile,*metaTags[pos]);
01089 }
01090 }
01091
01092 static GP<ByteStream>
01093 OCRcallback(
01094 void * const xarg,
01095 lt_XMLParser::mapOCRcallback * const xcallback,
01096 const GUTF8String &value,
01097 const GP<DjVuImage> &image )
01098 {
01099 GP<ByteStream> retval;
01100 static void *arg=0;
01101 static lt_XMLParser::mapOCRcallback *callback=0;
01102 if(image)
01103 {
01104 if(callback)
01105 retval=callback(arg,value,image);
01106 }else
01107 {
01108 arg=xarg;
01109 callback=xcallback;
01110 }
01111 return retval;
01112 }
01113
01114 void
01115 lt_XMLParser::setOCRcallback(
01116 void * const arg,
01117 mapOCRcallback * const callback)
01118 {
01119 ::OCRcallback(arg,callback);
01120 }
01121
01122
01123 #ifdef HAVE_NAMESPACES
01124 }
01125 # ifndef NOT_USING_DJVU_NAMESPACE
01126 using namespace DJVU;
01127 # endif
01128 #endif