kpilot

makedoc9.cc

00001 // based on: MakeDoc, version 2
00002 // I only took the tBuf class from there and adapted it.
00003 //
00004 // Compresses text files into a format that is ready to export to a Pilot
00005 // and work with Rick Bram's PilotDOC reader.
00006 // Copyright (C) Reinhold Kainhofer, 2002
00007 // Copyright (C) Pat Beirne, 2000
00008 //
00009 // Original file (makedoc9.cpp) copyright by:
00010 // Copyright (C) Pat Beirne, 2000.
00011 // Distributable under the GNU General Public License Version 2 or later.
00012 //
00013 // ver 0.6 enforce 31 char limit on database names
00014 // ver 0.7 change header and record0 to structs
00015 // ver 2.0 added category control on the command line
00016 //              changed extensions from .prc to .pdb
00017 
00018 /*
00019 ** This program is free software; you can redistribute it and/or modify
00020 ** it under the terms of the GNU General Public License as published by
00021 ** the Free Software Foundation; either version 2 of the License, or
00022 ** (at your option) any later version.
00023 **
00024 ** This program is distributed in the hope that it will be useful,
00025 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
00026 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00027 ** GNU General Public License for more details.
00028 **
00029 ** You should have received a copy of the GNU General Public License
00030 ** along with this program in a file called COPYING; if not, write to
00031 ** the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
00032 ** MA 02110-1301, USA.
00033 */
00034 
00035 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <iostream>
#include <vector>

#include "makedoc9.h"
00044 
00045 
00046 
00047 //
00048 // Issue()
00049 //
00050 // action: handle the details of writing a single
00051 //              character to the compressed stream
00052 //
00053 unsigned
00054  tBuf::Issue(byte src, int &bSpace)
00055 {
00056     unsigned int iDest = len;
00057     byte *dest = buf;
00058 
00059     // TODO: which of the if parts should really be included???
00060 #if 0
00061     // modified version of issue
00062     // just issue the char
00063     if (src >= 0x80 || src <= 8)
00064         dest[iDest++] = 1;
00065     dest[iDest++] = src;
00066 
00067 #else
00068     // if there is an outstanding space char, see if
00069     // we can squeeze it in with an ASCII char
00070     if (bSpace)
00071     {
00072         if (src >= 0x40 && src <= 0x7F)
00073             dest[iDest++] = src ^ 0x80;
00074         else
00075         {
00076             // couldn't squeeze it in, so issue the space char by itself
00077             // most chars go out simple, except the range 1...8,0x80...0xFF
00078             dest[iDest++] = ' ';
00079             if (src < 0x80 && (src == 0 || src > 8))
00080                 dest[iDest++] = src;
00081             else
00082                 dest[iDest++] = 1, dest[iDest++] = src;
00083         }
00084         // knock down the space flag
00085         bSpace = 0;
00086     }
00087     else
00088     {
00089         // check for a space char
00090         if (src == ' ')
00091             bSpace = 1;
00092         else
00093         {
00094             if (src < 0x80 && (src == 0 || src > 8))
00095                 dest[iDest++] = src;
00096             else
00097                 dest[iDest++] = 1, dest[iDest++] = src;
00098 
00099         }
00100     }
00101 #endif
00102     len = iDest;
00103     return iDest;
00104 }
00105 
00106 //
00107 // Compress
00108 //
00109 // params:      none
00110 //
00111 // action:      takes the given buffer,
00112 //                                      and compresses
00113 //                                      the original data down into a second buffer
00114 //
00115 // comment:     This version make heavy use of walking pointers.
00116 //
00117 unsigned tBuf::Compress()
00118 {
00119     if (!buf)
00120         return 0;
00121     if (isCompressed) {
00122 //      cout<<"Buffer is already compressed!"<<endl;
00123         return len;
00124 //  } else {
00125 //      cout<<" Compressing buffer!!!"<<endl;
00126     }
00127 
00128     unsigned int i;
00129 
00130     // run through the input buffer
00131     byte *pBuffer;                    // points to the input buffer
00132     byte *pHit;                       // points to a walking test hit; works upwards on successive matches
00133     byte *pPrevHit;               // previous value of pHit; also, start of next test
00134     byte *pTestHead;                  // current test string
00135     byte *pTestTail;                  // current walking pointer; one past the current test buffer
00136     byte *pEnd;                       // 1 past the end of the input buffer
00137 
00138     pHit = pPrevHit = pTestHead = pBuffer = buf;
00139     pTestTail = pTestHead + 1;
00140     pEnd = buf + len;                 // should point to a 0!
00141 
00142     // make a dest buffer and reassign the local buffer
00143     buf = new byte[6000];
00144     len = 0;                              // used to walk through the output buffer
00145 
00146     // loop, absorbing one more char from the input buffer on each pass
00147     for (; pTestHead != pEnd; pTestTail++)
00148     {
00149         // if we already have 10 char match, don't bother scanning again for the 11th (wasted time)
00150         if (pTestTail - pTestHead != (1 << COUNT_BITS) + 3)
00151         {
00152             // scan in the previous data for a match
00153             // terminate the test string (and the matcher string, as well!) in a 0
00154             byte tmp = *pTestTail;
00155 
00156             *pTestTail = 0;
00157             pHit = (byte *) strstr((const char *) pPrevHit,
00158                 (const char *) pTestHead);
00159             *pTestTail = tmp;         // restore the char
00160         }
00161 
00162         // on a mismatch or end of buffer, issued codes
00163         if (pHit == pTestHead
00164             || pTestTail - pTestHead > (1 << COUNT_BITS) + 2
00165             || pTestTail == pEnd)
00166         {
00167             // issue the codes
00168             // first, check for short runs
00169             if (pTestTail - pTestHead < 4)
00170             {
00171                 if (pTestHead[0] > 0x7F || pTestHead[0] <= 8)
00172                     buf[len++] = 1;
00173                 buf[len++] = pTestHead[0];
00174                 pTestHead++;
00175             }
00176             // for longer runs, issue a run-code
00177             else
00178             {
00179                 unsigned int dist = pTestHead - pPrevHit;
00180                 unsigned int compound =
00181                     (dist << COUNT_BITS) + pTestTail - pTestHead - 4;
00182 
00183 //if (dist>=(1<<DISP_BITS)) printf("\n!! error dist overflow");
00184 //if (pTestTail-pTestHead-4>7) printf("\n!! error len overflow");
00185 
00186                 buf[len++] = 0x80 + (compound >> 8);
00187                 buf[len++] = compound & 0xFF;
00188 //printf("\nissuing code for sequence len %d <%c%c%c>",pTestTail-pTestHead-1,pTestHead[0],pTestHead[1],pTestHead[2]);
00189 //printf("\n          <%x%x>",pOut[-2],pOut[-1]);
00190                 // and start again
00191                 pTestHead = pTestTail - 1;
00192             }
00193             // start the search again
00194             pPrevHit = pBuffer;
00195             // within range
00196             if (pTestHead - pPrevHit > ((1 << DISP_BITS) - 1))
00197                 pPrevHit = pTestHead - ((1 << DISP_BITS) - 1);
00198         }
00199         // got a match
00200         else
00201         {
00202             pPrevHit = pHit;
00203         }
00204         // when we get to the end of the buffer, don't inc past the end
00205         // this forces the residue chars out one at a time
00206         if (pTestTail == pEnd)
00207             pTestTail--;
00208     }
00209 
00210 
00211     // final scan to merge consecutive high chars together
00212     // and merge space chars
00213     unsigned int k;
00214 
00215     for (i = k = 0; i < len; i++, k++)
00216     {
00217         buf[k] = buf[i];
00218         // skip the run-length codes
00219         if (buf[k] >= 0x80 && buf[k] < 0xC0)
00220             buf[++k] = buf[++i];
00221         // if we hit a high char marker, look ahead for another
00222         // and merge multiples together
00223         else if (buf[k] == 1)
00224         {
00225             buf[k + 1] = buf[i + 1];
00226             while (i + 2 < len && buf[i + 2] == 1 && buf[k] < 8)
00227             {
00228                 buf[k]++;
00229                 buf[k + buf[k]] = buf[i + 3];
00230                 i += 2;
00231             }
00232             k += buf[k];
00233             i++;
00234         }
00235         else if (buf[k] == ' ' && i < len - 1 && buf[i + 1] <= 0x7F
00236             && buf[i + 1] >= 0x40)
00237             buf[k] = 0x80 | buf[++i];
00238     }
00239 
00240     // delete original buffer
00241     delete[]pBuffer;
00242     len = k;
00243 
00244     isCompressed = true;
00245     return k;
00246 }
00247 
00248 /*
00249     Decompress
00250 
00251     params: none
00252 
00253     action: make a new buffer
00254                     run through the source data
00255                     check the 4 cases:
00256                         0,9...7F represent self
00257                         1...8       escape n chars
00258                         80...bf reference earlier run
00259                         c0...ff space+ASCII
00260 
00261 */
00262 unsigned tBuf::Decompress()
00263 {
00264     if (!buf)
00265         return 0;
00266     if (!isCompressed) {
00267 //      cout<<"Buffer already uncompressed. Doing nothing"<<endl;
00268         return len;
00269 //  } else {
00270 //      cout<<"Decompressing buffer"<<endl;
00271     }
00272 
00273     // we "know" that all decompresses fit within 4096, right?
00274     byte *pOut = new byte[6000];
00275     byte *in_buf = buf;
00276     byte *out_buf = pOut;
00277 
00278     unsigned int i, j;
00279 
00280     for (j = i = 0; j < len;)
00281     {
00282         unsigned int c;
00283 
00284         // take a char from the input buffer
00285         c = in_buf[j++];
00286 
00287         // separate the char into zones: 0, 1...8, 9...0x7F, 0x80...0xBF, 0xC0...0xFF
00288 
00289         // codes 1...8 mean copy that many bytes; for accented chars & binary
00290         if (c > 0 && c < 9)
00291             while (c--)
00292                 out_buf[i++] = in_buf[j++];
00293 
00294         // codes 0, 9...0x7F represent themselves
00295         else if (c < 0x80)
00296             out_buf[i++] = c;
00297 
00298         // codes 0xC0...0xFF represent "space + ascii char"
00299         else if (c >= 0xC0)
00300             out_buf[i++] = ' ', out_buf[i++] = c ^ 0x80;
00301 
00302         // codes 0x80...0xBf represent sequences
00303         else
00304         {
00305             int m, n;
00306 
00307             c <<= 8;
00308             c += in_buf[j++];
00309             m = (c & 0x3FFF) >> COUNT_BITS;
00310             n = c & ((1 << COUNT_BITS) - 1);
00311             n += 3;
00312             while (n--)
00313             {
00314                 out_buf[i] = out_buf[i - m];
00315                 i++;
00316             }
00317         }
00318     }
00319     out_buf[i++]='\0';
00320     out_buf[i++]='\0';
00321     delete[]buf;
00322     buf = pOut;
00323     len = i;
00324 
00325     isCompressed = false;
00326     return i;
00327 }
00328 
00329 unsigned tBuf::DuplicateCR()
00330 {
00331     if (!buf)
00332         return 0;
00333     byte *pBuf = new byte[2 * len];
00334 
00335     unsigned int k, j;
00336 
00337     for (j = k = 0; j < len; j++, k++)
00338     {
00339         pBuf[k] = buf[j];
00340         if (pBuf[k] == 0x0A)
00341             pBuf[k++] = 0x0D, pBuf[k] = 0x0A;
00342     }
00343     delete[]buf;
00344     buf = pBuf;
00345     len = k;
00346     return k;
00347 }
00348 
00349 
00350 
00351 // this nasty little beast removes really low ASCII and 0's
00352 // and handles the CR problem
00353 //
00354 // if a cr appears before a lf, then remove the cr
00355 // if a cr appears in isolation, change to a lf
00356 unsigned tBuf::RemoveBinary()
00357 {
00358     if (!buf)
00359         return 0;
00360     byte *in_buf = buf;
00361     byte *out_buf = new byte[len];
00362 
00363     unsigned int k, j;
00364 
00365     for (j = k = 0; j < len; j++, k++)
00366     {
00367         // copy each byte
00368         out_buf[k] = in_buf[j];
00369 
00370         // throw away really low ASCII
00371         if (( /*out_buf[k]>=0 && */ out_buf[k] < 9))
00372             k--;
00373 
00374         // for CR
00375         if (out_buf[k] == 0x0D)
00376         {
00377             // if next is LF, then drop it
00378             if (j < len - 1 && in_buf[j + 1] == 0x0A)
00379                 k--;
00380             else                          // turn it into a LF
00381                 out_buf[k] = 0x0A;
00382         }
00383     }
00384     delete[]buf;
00385     buf = out_buf;
00386     len = k;
00387     return k;
00388 }
00389 
00390 void tBuf::setText(const byte * text, unsigned txtlen, bool txtcomp)
00391 {
00392     if (buf)
00393         delete[]buf;
00394     buf = 0L;
00395 
00396     if (txtlen <= 0)
00397         txtlen = strlen((const char *) text);
00398     len = txtlen;
00399     buf = new byte[len];
00400 
00401     memcpy(buf, text, len*sizeof(char));
00402 //  strncpy((char *) buf, (const char *) text, len);
00403     isCompressed = txtcomp;
00404 //  cout<<"Setting text, compressed="<<txtcomp<<endl;
00405 }
kpilot

makedoc9.cc

kpilot

API Reference