filters

FilterPage.cpp

00001 /*
00002  * Copyright (c) 2002-2003 Nicolas HADACEK (hadacek@kde.org)
00003  *
00004  * This program is free software; you can redistribute it and/or modify
00005  * it under the terms of the GNU General Public License as published by
00006  * the Free Software Foundation; either version 2 of the License, or
00007  * (at your option) any later version.
00008 
00009  * This program is distributed in the hope that it will be useful,
00010  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  * GNU General Public License for more details.
00013 
00014  * You should have received a copy of the GNU General Public License
00015  * along with this program; if not, write to the Free Software
00016  * Foundation, Inc., 51 Franklin Street, Fifth Floor,
00017  * Boston, MA 02110-1301, USA.
00018  */
00019 
00020 #include "FilterPage.h"
00021 
00022 #include <math.h>
00023 
00024 #include <kglobal.h>
00025 #include <kdebug.h>
00026 
00027 #include "data.h"
00028 #include "transform.h"
00029 #include "dialog.h"
00030 
// Debug timing helpers. Both expect a name _time to be in scope at the point
// of use, providing restart() and elapsed().
// TIME_START logs the given label on debug area 30516 and restarts _time;
// TIME_END logs the value returned by _time.elapsed().
// The bodies are wrapped in do { } while (0) so that each macro expands to
// exactly one statement that takes its trailing ';' and stays correct when
// used as the body of an unbraced if/else.
#define TIME_START(str) do { \
    kdDebug(30516) << str << endl; \
    _time.restart(); \
} while (0)
#define TIME_END do { \
    kdDebug(30516) << "elapsed=" << _time.elapsed() << endl; \
} while (0)
00036 
00037 
00038 namespace PDFImport
00039 {
00040 
00041 //-----------------------------------------------------------------------------
00042 Page::Page(Data &data)
00043     : TextPage(false), _data(data), _lastStr(0), _rects(Nb_ParagraphTypes)
00044 {
00045     _links.setAutoDelete(true);
00046 }
00047 
00048 void Page::clear()
00049 {
00050     TextPage::clear();
00051     _lastStr = 0;
00052     _links.clear();
00053     _pars.clear();
00054 }
00055 
00056 void Page::beginString(GfxState *state, double x0, double y0)
00057 {
00058     // This check is needed because Type 3 characters can contain
00059     // text-drawing operations.
00060     if (curStr) {
00061         ++nest;
00062         return;
00063     }
00064 
00065 //    _data.checkTextFrameset();
00066     curStr = new String(state, x0, y0, fontSize, _data.textIndex());
00067 //    kdDebug(30516) << "---" << endl;
00068 }
00069 
00070 void Page::endString()
00071 {
00072 //    kdDebug(30516) << "endString..." << " len=" << curStr->len
00073 //                   << " " << _lastStr
00074 //                   << " len=" << (_lastStr ? _lastStr->len : -1) << endl;
00075     TextPage::endString();
00076 //    kdDebug(30516) << "  ...endString done" << endl;
00077 }
00078 
00079 void Page::addString(TextString *str)
00080 {
00081 //    kdDebug(30516) << "addString..." << endl;
00082 //    if ( str->len==0 ) kdDebug(30516) << "empty string !" << endl;
00083     if (_lastStr) _lastStr->checkCombination(str);
00084     _lastStr = (str->len==0 ? 0 : static_cast<String *>(str));
00085 //    QString s;
00086 //    for (int i=0; i<str->len; i++) s += QChar(str->text[i]);
00087 //    kdDebug(30516) << "string: " << s << " ("
00088 //                   << (str->len>0 ? s[0].unicode() : 0) << ")" << endl;
00089     TextPage::addString(str);
00090 //    kdDebug(30516) << " ...addString done" << endl;
00091 }
00092 
00093 TextBlock *Page::block(TextLine *line, int index)
00094 {
00095     uint k = 0;
00096     if ( index<0 )
00097         for (TextBlock *block = line->blocks; block; block = block->next) k++;
00098     k += index;
00099     uint i = 0;
00100     for (TextBlock *block = line->blocks; block; block = block->next) {
00101         if ( i==k ) return block;
00102         i++;
00103     }
00104     return 0;
00105 }
00106 
00107 //-----------------------------------------------------------------------------
00108 bool Page::isLastParagraphLine(TextLine *line, const Paragraph &par)
00109 {
00110     // last line of page
00111     if ( line->next==0 ) return true;
00112     double dy = line->next->yMin - line->yMax;
00113     double ndy = line->next->yMax - line->next->yMin;
00114     String *str = static_cast<String *>(line->blocks->strings);
00115     String *nStr = static_cast<String *>(line->next->blocks->strings);
00116     // next line far below
00117     if ( dy>0.5*ndy ) return true;
00118     // image displayed before next line (?)
00119     if ( str->frameIndex()!=nStr->frameIndex() ) return true;
00120     if ( line->blocks==0 ) return false; // should not happen
00121     // if contains one or more inside tabs
00122     if (line->blocks->next) return true;
00123     if ( line->next && line->next->blocks==0 ) return false;//should not happen
00124     // if next line contains one or more inside tabs
00125     if ( line->next && line->next->blocks->next ) return true;
00126     TextBlock *b = block(line, -1);
00127     if ( b==0 || b->len==0 ) return false; // should not happen
00128     QChar c = QChar(b->text[b->len-1]);
00129     // last line char is not '.' or ':'
00130     if ( c!='.' && c!=':' ) return false;
00131     // if at line end and block aligned : same paragraph
00132     return ( !equal(b->xMax, par.rect().right()) );
00133 }
00134 
00135 void Page::createParagraphs()
00136 {
00137     TextLine *first = lines;
00138     uint nbLines = 0;
00139     for (TextLine *line = lines; line; line = line->next) {
00140         nbLines++;
00141         Paragraph par(first, nbLines);
00142         if ( isLastParagraphLine(line, par) ) {
00143             _pars.push_back(par);
00144             nbLines = 0;
00145             first = line->next;
00146         }
00147     }
00148 }
00149 
00150 void Page::checkHeader()
00151 {
00152     uint s = _pars.size();
00153     if ( s==0 ) return;
00154     Paragraph &par = _pars[0];
00155     if ( par.lines().count()!=1 ) return;
00156     const TextLine *first = par.lines().first();
00157     const TextLine *second = (s>1 ? _pars[1].lines().first() : 0);
00158     double limit = 0.2 * _data.pageRect().height();
00159     double delta = 2 * kMin(first->yMax - first->yMin, 12.0);
00160 //    kdDebug(30516) << "first: " << first->yMax << " (" << limit << ")" << endl;
00161 //    if (second) kdDebug(30516) << "second: " << second->yMin << " "
00162 //                               << second->yMin-first->yMax << " (" << delta
00163 //                               << ")" << endl;
00164     if ( first->yMax>limit ) return;
00165     if ( second && (second->yMin-first->yMax)<delta ) return;
00166     par.type = Header;
00167     _rects[Header] = par.rect();
00168 }
00169 
00170 bool Page::hasHeader() const
00171 {
00172     return (_pars.size()>0 ? _pars[0].type==Header : false);
00173 }
00174 
00175 void Page::checkFooter()
00176 {
00177     uint s = _pars.size();
00178     if ( s==0 ) return;
00179     Paragraph &par = _pars[s-1];
00180     if ( par.lines().count()!=1 ) return;
00181     const TextLine *last = par.lines().first();
00182     const TextLine *blast = (s>1 ? _pars[s-2].lines().last() : 0);
00183     double limit = 0.8 * _data.pageRect().height();
00184     double delta = 2 * kMin(last->yMax-last->yMin, 12.0);
00185 //    kdDebug(30516) << "last: " << last->yMax << " (" << limit << ")" << endl;
00186 //    if (blast) kdDebug(30516) << "blast: " << blast->yMin << " "
00187 //                              <<  last->yMin-blast->yMax << " (" << delta
00188 //                              << ")" << endl;
00189     if ( last->yMin<limit ) return;
00190     if ( blast && (last->yMin-blast->yMax)<delta ) return;
00191     par.type = Footer;
00192     _rects[Footer] = par.rect();
00193 }
00194 
00195 bool Page::hasFooter() const
00196 {
00197     return (_pars.size()>0 ? _pars[_pars.size()-1].type==Footer
00198             : false);
00199 }
00200 
00201 void Page::endPage()
00202 {
00203     TIME_START("coalesce strings");
00204     TextPage::coalesce();
00205     TIME_END;
00206 
00207     createParagraphs();
00208 
00209     // check header and footer
00210     checkHeader();
00211 //    if ( hasHeader() ) kdDebug(30516) << "has header" << endl;
00212     checkFooter();
00213 //    if ( hasFooter() ) kdDebug(30516) << "has footer" << endl;
00214 
00215     // compute body rect
00216     uint begin = (hasHeader() ? 1 : 0);
00217     uint end = _pars.size() - (hasFooter() ? 1 : 0);
00218     for (uint i=begin; i<end; i++)
00219         _rects[Body].unite(_pars[i].rect());
00220 }
00221 
00222 
00223 //-----------------------------------------------------------------------------
00224 void Page::initParagraph(Paragraph &par) const
00225 {
00226     bool rightAligned = true, centered = true, leftAligned = true;
00227     const double pleft = _rects[par.type].left();
00228     const double pright = _rects[par.type].right();
00229     const double pmean = (pleft + pright) / 2;
00230 
00231     QValueList<TextLine *>::const_iterator it;
00232     for (it = par.lines().begin(); it!=par.lines().end(); ++it) {
00233 
00234         // compute tabulations
00235         Tabulator tab;
00236         for (TextBlock *blk = (*it)->blocks; blk; blk = blk->next) {
00237             // if tabulated text is aligned on right edge: put a tab
00238             // on right edge and the tab type will be right aligned...
00239             double tabRightAligned = equal(blk->xMax, pright);
00240             double dx = (tabRightAligned ? pright : blk->xMin) - pleft;
00241             // #### if the tab is just at the frame edge:
00242             // the text is sent to next line ???
00243             if (tabRightAligned) dx -= 0.1;
00244             int res = par.findTab(dx, *it);
00245             if ( res==-1 ) {
00246                 tab.pos = dx;
00247                 if (tabRightAligned) {
00248                     tab.alignment = Tabulator::Right;
00249                     kdDebug(30516) << "tabulated text right aligned.." << endl;
00250                 } else tab.alignment = Tabulator::Left;
00251                 par.tabs.push_back(tab);
00252             }
00253         }
00254         qHeapSort2(par.tabs);
00255 
00256         // compute indents
00257         double left = (*it)->blocks->xMin - pleft;
00258         if ( par.isFirst(*it) ) {
00259             par.firstIndent = left;
00260             par.leftIndent = left;
00261         } else if ( par.isSecond(*it) ) par.leftIndent = left;
00262         else par.leftIndent = kMin(par.leftIndent, left);
00263     }
00264 
00265     // compute alignment
00266     for (it = par.lines().begin(); it!=par.lines().end(); ++it) {
00267         double left = (*it)->blocks->xMin;
00268         double right = block(*it, -1)->xMax;
00269         double mean = (left + right) / 2;
00270 //        QString text;
00271 //        for (int i=0; i<kMin(4, (*it)->blocks->len); i++)
00272 //            text += QChar((*it)->blocks->text[i]);
00273 //        kdDebug(30516) << text << " left=" << left
00274 //                       << " pleft=" << pleft + par.leftIndent
00275 //                       << " indent=" << par.leftIndent
00276 //                       << " findent=" << par.firstIndent << endl;
00277         if ( centered && !equal(mean, pmean) ) centered = false;
00278         if ( leftAligned && (!par.isFirst(*it) || par.hasOneLine())
00279              && !equal(left, pleft + par.leftIndent, 0.05) ) {
00280             kdDebug(30516) << "not left aligned" << endl;
00281             leftAligned = false;
00282         }
00283         if ( rightAligned && (!par.isLast(*it) || par.hasOneLine())
00284              && !equal(right, pright, 0.05) ) {
00285             kdDebug(30516) << "not right aligned" << endl;
00286             rightAligned = false;
00287         }
00288     }
00289 
00290     // finalize alignment
00291     if (rightAligned) par.align = (leftAligned ? AlignBlock : AlignRight);
00292     else if (centered) par.align = AlignCenter;
00293 }
00294 
00295 void Page::fillParagraph(Paragraph &par, double &offset) const
00296 {
00297     const double pleft = _rects[par.type].left();
00298     const double pright = _rects[par.type].right();
00299     par.offset = par.lines().first()->yMin - offset;
00300 //    kdDebug(30516) << "offset=" << offset
00301 //                   << " yMin=" << par.lines().first()->yMin
00302 //                   << " paroffset=" << par.offset << endl;
00303     if ( par.offset>0 ) offset += par.offset;
00304 
00305     QValueList<TextLine *>::const_iterator it;
00306     for (it = par.lines().begin(); it!=par.lines().end(); ++it) {
00307         // end of previous line (inside a paragraph)
00308         if ( !par.isFirst(*it) ) {
00309             bool hyphen = false;
00310             if (_data.options().smart) {
00311                 // check hyphen
00312                 uint bi, pbi;
00313                 int si = par.charFromEnd(0, bi);
00314                 Q_ASSERT( si>=0 );
00315                 QChar c = par.blocks[bi].text[si];
00316                 int psi = par.charFromEnd(1, pbi);
00317                 QChar prev = (psi<0 ? QChar::null : par.blocks[pbi].text[psi]);
00318                 if ( !prev.isNull() && type(c.unicode())==Hyphen )
00319                     kdDebug(30516) << "hyphen ? " << QString(prev)
00320                                    << " type=" << type(prev.unicode())
00321                                    << endl;
00322                 TextString *next =
00323                     ((*it)->next ? (*it)->next->blocks->strings : 0);
00324                 if ( !prev.isNull() && type(c.unicode())==Hyphen
00325                      && isLetter( type(prev.unicode()) )
00326                      && next && next->len>0
00327                      && isLetter( type(next->text[next->len-1]) ) ) {
00328                     kdDebug(30516) << "found hyphen" << endl;
00329                     hyphen = true;
00330                     par.blocks[bi].text.remove(si, 1);
00331                 }
00332             }
00333             if ( !hyphen ) {
00334                 Block b;
00335                 bool remove = _data.options().smart;
00336                 if ( remove && par.align!=AlignBlock )
00337                     remove = ( par.rect().right()>0.9*pright );
00338                 b.text = (remove ? ' ' : '\n');
00339                 b.font = static_cast<String *>((*it)->blocks->strings)->font();
00340                 par.blocks.push_back(b);
00341             }
00342         }
00343 
00344         int lineHeight = 0;
00345         TextBlock *prevBlk = 0;
00346         for (TextBlock *blk = (*it)->blocks; blk; blk = blk->next) {
00347 
00348             // tabulations
00349             double tabRightAligned = equal(blk->xMax, pright);
00350             double dx = (tabRightAligned ? pright : blk->xMin) - pleft;
00351             int res = par.findTab(dx, *it);
00352             if ( res>=0 ) {
00353                 if (prevBlk) {
00354                     double xMax = prevBlk->xMax - pleft;
00355                     res = par.findNbTabs(res, xMax);
00356                     if ( res==0 ) continue;
00357                 } else res++;
00358                 // no tabs for first block in AlignCenter and AlignRight
00359                 // if smart mode
00360                 if ( prevBlk || !_data.options().smart
00361                      || (par.align!=AlignCenter && par.align!=AlignRight) ) {
00362                     Block b;
00363                     b.font = static_cast<String *>(blk->strings)->font();
00364                     for (uint k=0; k<(uint)res; k++) b.text += '\t';
00365                     par.blocks.push_back(b);
00366                 }
00367             }
00368 
00369             // text & format
00370             for (TextString *str = blk->strings; str; str = str->next) {
00371                 Block b;
00372                 for (uint k = 0; k<uint(str->len); k++)
00373                     b.text += QChar(str->text[k]);
00374                 if (str->spaceAfter) b.text += ' ';
00375                 String *fstr = static_cast<String *>(str);
00376                 b.font = fstr->font();
00377                 b.link = fstr->link;
00378                 par.blocks.push_back(b);
00379                 lineHeight = kMax(lineHeight, b.font.height());
00380             }
00381 
00382             prevBlk = blk;
00383         }
00384 
00385         offset += lineHeight;
00386     }
00387 }
00388 
00389 FontFamily Page::checkSpecial(QChar &c, const Font &font) const
00390 {
00391     Unicode res = 0;
00392     switch ( PDFImport::checkSpecial(c.unicode(), res) ) {
00393     case Bullet:
00394         kdDebug(30516) << "found bullet" << endl;
00395         // #### FIXME : if list, use a COUNTER
00396         // temporarly replace by symbol
00397         c = res;
00398         return Symbol;
00399     case SuperScript:
00400         kdDebug(30516) << "found superscript" << endl;
00401         // #### FIXME
00402         break;
00403     case LatexSpecial:
00404         if ( !font.isLatex() ) break;
00405         kdDebug(30516) << "found latex special" << endl;
00406         return Times;
00407     case SpecialSymbol:
00408         kdDebug(30516) << "found symbol=" << c.unicode() << endl;
00409         return Times;
00410         //return Symbol;
00411     default:
00412         break;
00413     }
00414 
00415     return Nb_Family;
00416 }
00417 
00418 void Page::checkSpecialChars(Paragraph &par) const
00419 {
00420     QValueList<Block> blocks;
00421     for (uint k=0; k<par.blocks.size(); k++) {
00422         const Block &b = par.blocks[k];
00423         QString res;
00424 //            kdDebug(30516) << "check \"" << b.text << "\"" << endl;
00425         for (uint l=0; l<b.text.length(); l++) {
00426             QChar c = b.text[l];
00427             FontFamily family = checkSpecial(c, b.font);
00428             if ( family==Nb_Family ) res += c;
00429             else {
00430                 if ( !res.isEmpty() ) {
00431                     blocks.push_back(b);
00432                     blocks.back().text = res;
00433                     res = QString::null;
00434                 }
00435                 blocks.push_back(b);
00436                 blocks.back().font.setFamily(family);
00437                 blocks.back().text = c;
00438             }
00439         }
00440         if ( !res.isEmpty() ) {
00441             blocks.push_back(b);
00442             blocks.back().text = res;
00443         }
00444     }
00445     par.blocks = blocks;
00446 }
00447 
00448 void Page::coalesce(Paragraph &par) const
00449 {
00450     QValueList<Block> blocks;
00451     blocks.push_back(par.blocks[0]);
00452     for (uint k=1; k<par.blocks.size(); k++) {
00453         const Block &b = par.blocks[k];
00454         if ( b.link==blocks.back().link && b.font==blocks.back().font )
00455             blocks.back().text += b.text;
00456         else blocks.push_back(b);
00457     }
00458     par.blocks = blocks;
00459 }
00460 
00461 void Page::prepare()
00462 {
00463     TIME_START("associate links");
00464     for (Link *link=_links.first(); link; link=_links.next()) {
00465         const DRect &r = link->rect();
00466 //        kdDebug(30516) << "link " << r.toString() << endl;
00467         for (TextLine *line = lines; line; line = line->next)
00468             for (TextBlock *blk = line->blocks; blk; blk = blk->next)
00469                 for (TextString *str = blk->strings; str; str = str->next) {
00470                     String *fstr = static_cast<String *>(str);
00471                     DRect sr = fstr->rect();
00472 //                    kdDebug(30516) << "str " << sr.toString() << " "
00473 //                                   << r.isInside(sr) << endl;
00474                     if ( r.isInside(sr) ) fstr->link = link;
00475                 }
00476     }
00477     TIME_END;
00478 
00479     TIME_START("init paragraphs");
00480     for (uint i=0; i<_pars.size(); i++) {
00481         initParagraph(_pars[i]);
00482 
00483         // special case for wide and centered one liner without tab
00484         if ( _pars[i].align==AlignBlock && _pars[i].hasOneLine()
00485              && _pars[i].tabs.size()==0
00486              && (_pars.size()==1
00487                  || (i!=0 && _pars[i-1].align==AlignCenter)
00488                  || ((i+1)!=_pars.size() && _pars[i+1].align==AlignCenter)) )
00489             _pars[i].align = AlignCenter;
00490     }
00491     TIME_END;
00492 
00493     TIME_START("fill paragraphs");
00494     uint begin = 0;
00495     if ( hasHeader() ) {
00496         double offset = _rects[Header].top();
00497         fillParagraph(_pars[0], offset);
00498         begin++;
00499     }
00500     uint end = _pars.size();
00501     if ( hasFooter() ) {
00502         double offset = _rects[Footer].top();
00503         end--;
00504         fillParagraph(_pars[end], offset);
00505     }
00506     double offset = _rects[Body].top();
00507     for (uint i=begin; i<end; i++)
00508         fillParagraph(_pars[i], offset);
00509     TIME_END;
00510 
00511     TIME_START("check for special chars");
00512     for (uint i=0; i<_pars.size(); i++)
00513         checkSpecialChars(_pars[i]);
00514     TIME_END;
00515 
00516     // this is not really required...
00517     TIME_START("coalesce formats");
00518     for (uint i=0; i<_pars.size(); i++)
00519         coalesce(_pars[i]);
00520     TIME_END;
00521 
00522     // if no paragraph : add an empty one
00523     if ( _pars.size()==0 ) {
00524         Block b;
00525         Paragraph par(0, 0);
00526         par.blocks.push_back(b);
00527         _pars.push_back(par);
00528     }
00529 }
00530 
00531 void Page::dump(const Paragraph &par)
00532 {
00533     QValueVector<QDomElement> layouts;
00534     QValueVector<QDomElement> formats;
00535 
00536     // tabulations
00537     for (uint k=0; k<par.tabs.size(); k++) {
00538         QDomElement element = par.tabs[k].createElement(_data);
00539         layouts.push_back(element);
00540     }
00541 
00542     // indents
00543     if ( !_data.options().smart || par.align!=AlignCenter ) {
00544         QDomElement element = _data.createElement("INDENTS");
00545         element.setAttribute("left", par.leftIndent);
00546         double delta = par.firstIndent - par.leftIndent;
00547         if ( !equal(delta, 0) ) element.setAttribute("first", delta);
00548         layouts.push_back(element);
00549     }
00550 
00551     // offset before
00552     if ( par.offset>0 ) {
00553         QDomElement element = _data.createElement("OFFSETS");
00554         element.setAttribute("before", par.offset);
00555         layouts.push_back(element);
00556     }
00557 
00558     // flow
00559     if (_data.options().smart) {
00560         QString flow;
00561 //        kdDebug(30516) << "flow=" << par.align << endl;
00562         switch (par.align) {
00563         case AlignLeft: break;
00564         case AlignRight: flow = "right"; break;
00565         case AlignCenter: flow = "center"; break;
00566         case AlignBlock: flow = "justify"; break;
00567         }
00568         if ( !flow.isEmpty() ) {
00569             QDomElement element = _data.createElement("FLOW");
00570             element.setAttribute("align", flow.utf8());
00571             layouts.push_back(element);
00572         }
00573     }
00574 
00575     // text and formats
00576     QString text;
00577     uint pos = 0;
00578     for (uint k=0; k<par.blocks.size(); k++) {
00579         const Block &b = par.blocks[k];
00580         text += (b.link ? "#" : b.text);
00581         uint len = (b.link ? 1 : b.text.length());
00582         QDomElement element = _data.createElement("FORMAT");
00583         QDomDocument document = _data.document();
00584         bool r = b.font.format(document, element, pos, len);
00585         if (b.link) b.link->format(document, element, pos, b.text);
00586         if ( r || b.link ) formats.push_back(element);
00587         pos += len;
00588     }
00589 
00590     _data.createParagraph(text, par.type, layouts, formats);
00591 }
00592 
00593 void Page::dump()
00594 {
00595     prepare();
00596 
00597     TIME_START("dump XML");
00598     for (uint i=0; i<_pars.size(); i++)
00599         dump(_pars[i]);
00600     TIME_END;
00601 }
00602 
00603 } // namespace
KDE Home | KDE Accessibility Home | Description of Access Keys