1    /* ====================================================================
2     * The Apache Software License, Version 1.1
3     *
4     * Copyright (c) 2002 The Apache Software Foundation.  All rights
5     * reserved.
6     *
7     * Redistribution and use in source and binary forms, with or without
8     * modification, are permitted provided that the following conditions
9     * are met:
10    *
11    * 1. Redistributions of source code must retain the above copyright
12    *    notice, this list of conditions and the following disclaimer.
13    *
14    * 2. Redistributions in binary form must reproduce the above copyright
15    *    notice, this list of conditions and the following disclaimer in
16    *    the documentation and/or other materials provided with the
17    *    distribution.
18    *
19    * 3. The end-user documentation included with the redistribution,
20    *    if any, must include the following acknowledgment:
21    *       "This product includes software developed by the
22    *        Apache Software Foundation (http://www.apache.org/)."
23    *    Alternately, this acknowledgment may appear in the software itself,
24    *    if and wherever such third-party acknowledgments normally appear.
25    *
26    * 4. The names "Apache" and "Apache Software Foundation" and
27    *    "Apache POI" must not be used to endorse or promote products
28    *    derived from this software without prior written permission. For
29    *    written permission, please contact apache@apache.org.
30    *
31    * 5. Products derived from this software may not be called "Apache",
32    *    "Apache POI", nor may "Apache" appear in their name, without
33    *    prior written permission of the Apache Software Foundation.
34    *
35    * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
36    * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
37    * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
38    * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
39    * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40    * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
41    * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
42    * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
43    * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
44    * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
45    * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
46    * SUCH DAMAGE.
47    * ====================================================================
48    *
49    * This software consists of voluntary contributions made by many
50    * individuals on behalf of the Apache Software Foundation.  For more
51    * information on the Apache Software Foundation, please see
52    * <http://www.apache.org/>.
53    */
54   
55   package org.apache.poi.hssf.record;
56   
57   import org.apache.poi.util.BinaryTree;
58   import org.apache.poi.util.LittleEndian;
59   import org.apache.poi.util.LittleEndianConsts;
60   
61   /**
62    * Handles the task of deserializing a SST string.  The two main entry points are
63    *
64    * @author Glen Stampoultzis (glens at apache.org)
65    */
66   class SSTDeserializer
67   {
68   
69       private BinaryTree strings;
70       /** this is the number of characters we expect in the first sub-record in a subsequent continuation record */
71       private int continuationExpectedChars;
72       /** this is the string we were working on before hitting the end of the current record. This string is NOT finished. */
73       private String unfinishedString;
74       /** this is true if the string uses wide characters */
75       private boolean wideChar;
76       /** this is true if the string is a rich text string */
77       private boolean richText;
78       /** this is true if the string is a far east string or some other wierd string */
79       private boolean extendedText;
80       /** Number of formatting runs in this rich text field */
81       private short runCount;
82       /** Number of characters in current string */
83       private int charCount;
84       private int extensionLength;
85   
86   
87       public SSTDeserializer( BinaryTree strings )
88       {
89           this.strings = strings;
90           initVars();
91       }
92   
93       private void initVars()
94       {
95           runCount = 0;
96           continuationExpectedChars = 0;
97           unfinishedString = "";
98   //        bytesInCurrentSegment = 0;
99   //        stringDataOffset = 0;
100          wideChar = false;
101          richText = false;
102          extendedText = false;
103      }
104  
105      /**
106       * This is the starting point where strings are constructed.  Note that
107       * strings may span across multiple continuations. Read the SST record
108       * carefully before beginning to hack.
109       */
110      public void manufactureStrings( final byte[] data, final int initialOffset, short dataSize )
111      {
112          initVars();
113  
114          int offset = initialOffset;
115          while ( ( offset - initialOffset ) < dataSize )
116          {
117              int remaining = dataSize - offset + initialOffset;
118  
119              if ( ( remaining > 0 ) && ( remaining < LittleEndianConsts.SHORT_SIZE ) )
120              {
121                  throw new RecordFormatException( "Cannot get length of the last string in SSTRecord" );
122              }
123              if ( remaining == LittleEndianConsts.SHORT_SIZE )
124              {
125                  setContinuationExpectedChars( LittleEndian.getUShort( data, offset ) );
126                  unfinishedString = "";
127                  break;
128              }
129              charCount = LittleEndian.getUShort( data, offset );
130              readStringHeader( data, offset );
131              boolean stringContinuesOverContinuation = remaining < totalStringSize();
132              if ( stringContinuesOverContinuation )
133              {
134                  int remainingBytes = ( initialOffset + dataSize ) - offset - stringHeaderOverhead();
135                  setContinuationExpectedChars( charCount - calculateCharCount( remainingBytes ) );
136                  charCount -= getContinuationExpectedChars();
137              }
138              else
139              {
140                  setContinuationExpectedChars( 0 );
141              }
142              processString( data, offset, charCount );
143              offset += totalStringSize();
144              if ( getContinuationExpectedChars() != 0 )
145              {
146                  break;
147              }
148          }
149      }
150  
151  //    private void dump( final byte[] data, int offset, int length )
152  //    {
153  //        try
154  //        {
155  //            System.out.println( "------------------- SST DUMP -------------------------" );
156  //            HexDump.dump( (byte[]) data, offset, System.out, offset, length );
157  //        }
158  //        catch ( IOException e )
159  //        {
160  //        }
161  //        catch ( ArrayIndexOutOfBoundsException e )
162  //        {
163  //        }
164  //        catch ( IllegalArgumentException e )
165  //        {
166  //        }
167  //    }
168  
169      /**
170       * Detemines the option types for the string (ie, compressed or uncompressed unicode, rich text string or
171       * plain string etc) and calculates the length and offset for the string.
172       *
173       */
174      private void readStringHeader( final byte[] data, final int index )
175      {
176  
177          byte optionFlag = data[index + LittleEndianConsts.SHORT_SIZE];
178  
179          wideChar = ( optionFlag & 1 ) == 1;
180          extendedText = ( optionFlag & 4 ) == 4;
181          richText = ( optionFlag & 8 ) == 8;
182          runCount = 0;
183          if ( richText )
184          {
185              runCount = LittleEndian.getShort( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD );
186          }
187          extensionLength = 0;
188          if ( extendedText )
189          {
190              extensionLength = LittleEndian.getInt( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD
191                      + (richText ? LittleEndianConsts.SHORT_SIZE : 0) );
192          }
193  
194      }
195  
196  
197      /**
198       * Reads a string or the first part of a string.
199       *
200       * @param characters the number of characters to write.
201       *
202       * @return the number of bytes written.
203       */
204      private int processString( final byte[] data, final int dataIndex, final int characters )
205      {
206  
207          // length is the length we store it as.  not the length that is read.
208          int length = SSTRecord.STRING_MINIMAL_OVERHEAD + calculateByteCount( characters );
209          byte[] unicodeStringBuffer = new byte[length];
210  
211          int offset = 0;
212  
213          // Set the length in characters
214          LittleEndian.putUShort( unicodeStringBuffer, offset, characters );
215          offset += LittleEndianConsts.SHORT_SIZE;
216          // Set the option flags
217          unicodeStringBuffer[offset] = data[dataIndex + offset];
218          // Copy in the string data
219          int bytesRead = unicodeStringBuffer.length - SSTRecord.STRING_MINIMAL_OVERHEAD;
220          arraycopy( data, dataIndex + stringHeaderOverhead(), unicodeStringBuffer, SSTRecord.STRING_MINIMAL_OVERHEAD, bytesRead );
221          // Create the unicode string
222          UnicodeString string = new UnicodeString( UnicodeString.sid,
223                  (short) unicodeStringBuffer.length,
224                  unicodeStringBuffer );
225  
226          if ( isStringFinished() )
227          {
228              Integer integer = new Integer( strings.size() );
229              addToStringTable( strings, integer, string );
230          }
231          else
232          {
233              unfinishedString = string.getString();
234          }
235  
236          return bytesRead;
237      }
238  
239      private boolean isStringFinished()
240      {
241          return getContinuationExpectedChars() == 0;
242      }
243  
244      /**
245       * Okay, we are doing some major cheating here. Because we can't handle rich text strings properly
246       * we end up getting duplicate strings.  To get around this I'm doing two things: 1. Converting rich
247       * text to normal text and 2. If there's a duplicate I'm adding a space onto the end.  Sneaky perhaps
248       * but it gets the job done until we can handle this a little better.
249       */
250      static public void addToStringTable( BinaryTree strings, Integer integer, UnicodeString string )
251      {
252  
253          if ( string.isRichText() )
254              string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~8 ) ) );
255          if ( string.isExtendedText() )
256              string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~4 ) ) );
257  
258          boolean added = false;
259          while ( added == false )
260          {
261              try
262              {
263                  strings.put( integer, string );
264                  added = true;
265              }
266              catch ( Exception ignore )
267              {
268                  string.setString( string.getString() + " " );
269              }
270          }
271  
272      }
273  
274  
275      private int calculateCharCount( final int byte_count )
276      {
277          return byte_count / ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE );
278      }
279  
280      /**
281       * Process a Continue record. A Continue record for an SST record
282       * contains the same kind of data that the SST record contains,
283       * with the following exceptions:
284       * <P>
285       * <OL>
286       * <LI>The string counts at the beginning of the SST record are
287       *     not in the Continue record
288       * <LI>The first string in the Continue record might NOT begin
289       *     with a size. If the last string in the previous record is
290       *     continued in this record, the size is determined by that
291       *     last string in the previous record; the first string will
292       *     begin with a flag byte, followed by the remaining bytes (or
293       *     words) of the last string from the previous
294       *     record. Otherwise, the first string in the record will
295       *     begin with a string length
296       * </OL>
297       *
298       * @param record the Continue record's byte data
299       */
300      public void processContinueRecord( final byte[] record )
301      {
302          if ( isStringFinished() )
303          {
304              initVars();
305              manufactureStrings( record, 0, (short) record.length );
306          }
307          else
308          {
309              // reset the wide bit because that can change across a continuation. the fact that it's
310              // actually rich text doesn't change across continuations even though the rich text
311              // may on longer be set in the "new" option flag.  confusing huh?
312              wideChar = ( record[0] & 1 ) == 1;
313  
314              if ( stringSpansContinuation( record.length - LittleEndianConsts.BYTE_SIZE ) )
315              {
316                  processEntireContinuation( record );
317              }
318              else
319              {
320                  readStringRemainder( record );
321              }
322          }
323  
324      }
325  
326      /**
327       * Reads the remainder string and any subsequent strings from the continuation record.
328       *
329       * @param record  The entire continuation record data.
330       */
331      private void readStringRemainder( final byte[] record )
332      {
333          int stringRemainderSizeInBytes = calculateByteCount( getContinuationExpectedChars() );
334  //        stringDataOffset = LittleEndianConsts.BYTE_SIZE;
335          byte[] unicodeStringData = new byte[SSTRecord.STRING_MINIMAL_OVERHEAD
336                  + calculateByteCount( getContinuationExpectedChars() )];
337  
338          // write the string length
339          LittleEndian.putShort( unicodeStringData, 0, (short) getContinuationExpectedChars() );
340  
341          // write the options flag
342          unicodeStringData[LittleEndianConsts.SHORT_SIZE] = createOptionByte( wideChar, richText, extendedText );
343  
344          // copy the bytes/words making up the string; skipping
345          // past all the overhead of the str_data array
346          arraycopy( record, LittleEndianConsts.BYTE_SIZE, unicodeStringData,
347                  SSTRecord.STRING_MINIMAL_OVERHEAD,
348                  unicodeStringData.length - SSTRecord.STRING_MINIMAL_OVERHEAD );
349  
350          // use special constructor to create the final string
351          UnicodeString string = new UnicodeString( UnicodeString.sid,
352                  (short) unicodeStringData.length, unicodeStringData,
353                  unfinishedString );
354          Integer integer = new Integer( strings.size() );
355  
356          addToStringTable( strings, integer, string );
357  
358          int newOffset = offsetForContinuedRecord( stringRemainderSizeInBytes );
359          manufactureStrings( record, newOffset, (short) ( record.length - newOffset ) );
360      }
361  
362      /**
363       * Calculates the size of the string in bytes based on the character width
364       */
365      private int stringSizeInBytes()
366      {
367          return calculateByteCount( charCount );
368      }
369  
370      /**
371       * Calculates the size of the string in byes.  This figure includes all the over
372       * heads for the string.
373       */
374      private int totalStringSize()
375      {
376          return stringSizeInBytes()
377                  + stringHeaderOverhead()
378                  + LittleEndianConsts.INT_SIZE * runCount
379                  + extensionLength;
380      }
381  
382      private int stringHeaderOverhead()
383      {
384          return SSTRecord.STRING_MINIMAL_OVERHEAD
385                  + ( richText ? LittleEndianConsts.SHORT_SIZE : 0 )
386                  + ( extendedText ? LittleEndianConsts.INT_SIZE : 0 );
387      }
388  
389      private int offsetForContinuedRecord( int stringRemainderSizeInBytes )
390      {
391          return stringRemainderSizeInBytes + LittleEndianConsts.BYTE_SIZE
392                  + runCount * LittleEndianConsts.INT_SIZE + extensionLength;
393      }
394  
395      private byte createOptionByte( boolean wideChar, boolean richText, boolean farEast )
396      {
397          return (byte) ( ( wideChar ? 1 : 0 ) + ( farEast ? 4 : 0 ) + ( richText ? 8 : 0 ) );
398      }
399  
400      /**
401       * If the continued record is so long is spans into the next continue then
402       * simply suck the remaining string data into the existing <code>unfinishedString</code>.
403       *
404       * @param record    The data from the continuation record.
405       */
406      private void processEntireContinuation( final byte[] record )
407      {
408          // create artificial data to create a UnicodeString
409          int dataLengthInBytes = record.length - LittleEndianConsts.BYTE_SIZE;
410          byte[] unicodeStringData = new byte[record.length + LittleEndianConsts.SHORT_SIZE];
411  
412          LittleEndian.putShort( unicodeStringData, (byte) 0, (short) calculateCharCount( dataLengthInBytes ) );
413          arraycopy( record, 0, unicodeStringData, LittleEndianConsts.SHORT_SIZE, record.length );
414          UnicodeString ucs = new UnicodeString( UnicodeString.sid, (short) unicodeStringData.length, unicodeStringData );
415  
416          unfinishedString = unfinishedString + ucs.getString();
417          setContinuationExpectedChars( getContinuationExpectedChars() - calculateCharCount( dataLengthInBytes ) );
418      }
419  
420      private boolean stringSpansContinuation( int continuationSizeInBytes )
421      {
422          return calculateByteCount( getContinuationExpectedChars() ) > continuationSizeInBytes;
423      }
424  
425      /**
426       * @return the number of characters we expect in the first
427       *         sub-record in a subsequent continuation record
428       */
429  
430      int getContinuationExpectedChars()
431      {
432          return continuationExpectedChars;
433      }
434  
435      private void setContinuationExpectedChars( final int count )
436      {
437          continuationExpectedChars = count;
438      }
439  
440      private int calculateByteCount( final int character_count )
441      {
442          return character_count * ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE );
443      }
444  
445  
446      /**
447       * Copies an array from the specified source array, beginning at the
448       * specified position, to the specified position of the destination array.
449       * A subsequence of array components are copied from the source
450       * array referenced by <code>src</code> to the destination array
451       * referenced by <code>dst</code>. The number of components copied is
452       * equal to the <code>length</code> argument. The components at
453       * positions <code>srcOffset</code> through
454       * <code>srcOffset+length-1</code> in the source array are copied into
455       * positions <code>dstOffset</code> through
456       * <code>dstOffset+length-1</code>, respectively, of the destination
457       * array.
458       * <p>
459       * If the <code>src</code> and <code>dst</code> arguments refer to the
460       * same array object, then the copying is performed as if the
461       * components at positions <code>srcOffset</code> through
462       * <code>srcOffset+length-1</code> were first copied to a temporary
463       * array with <code>length</code> components and then the contents of
464       * the temporary array were copied into positions
465       * <code>dstOffset</code> through <code>dstOffset+length-1</code> of the
466       * destination array.
467       * <p>
468       * If <code>dst</code> is <code>null</code>, then a
469       * <code>NullPointerException</code> is thrown.
470       * <p>
471       * If <code>src</code> is <code>null</code>, then a
472       * <code>NullPointerException</code> is thrown and the destination
473       * array is not modified.
474       * <p>
475       * Otherwise, if any of the following is true, an
476       * <code>ArrayStoreException</code> is thrown and the destination is
477       * not modified:
478       * <ul>
479       * <li>The <code>src</code> argument refers to an object that is not an
480       *     array.
481       * <li>The <code>dst</code> argument refers to an object that is not an
482       *     array.
483       * <li>The <code>src</code> argument and <code>dst</code> argument refer to
484       *     arrays whose component types are different primitive types.
485       * <li>The <code>src</code> argument refers to an array with a primitive
486       *     component type and the <code>dst</code> argument refers to an array
487       *     with a reference component type.
488       * <li>The <code>src</code> argument refers to an array with a reference
489       *     component type and the <code>dst</code> argument refers to an array
490       *     with a primitive component type.
491       * </ul>
492       * <p>
493       * Otherwise, if any of the following is true, an
494       * <code>IndexOutOfBoundsException</code> is
495       * thrown and the destination is not modified:
496       * <ul>
497       * <li>The <code>srcOffset</code> argument is negative.
498       * <li>The <code>dstOffset</code> argument is negative.
499       * <li>The <code>length</code> argument is negative.
500       * <li><code>srcOffset+length</code> is greater than
501       *     <code>src.length</code>, the length of the source array.
502       * <li><code>dstOffset+length</code> is greater than
503       *     <code>dst.length</code>, the length of the destination array.
504       * </ul>
505       * <p>
506       * Otherwise, if any actual component of the source array from
507       * position <code>srcOffset</code> through
508       * <code>srcOffset+length-1</code> cannot be converted to the component
509       * type of the destination array by assignment conversion, an
510       * <code>ArrayStoreException</code> is thrown. In this case, let
511       * <b><i>k</i></b> be the smallest nonnegative integer less than
512       * length such that <code>src[srcOffset+</code><i>k</i><code>]</code>
513       * cannot be converted to the component type of the destination
514       * array; when the exception is thrown, source array components from
515       * positions <code>srcOffset</code> through
516       * <code>srcOffset+</code><i>k</i><code>-1</code>
517       * will already have been copied to destination array positions
518       * <code>dstOffset</code> through
519       * <code>dstOffset+</code><i>k</I><code>-1</code> and no other
520       * positions of the destination array will have been modified.
521       * (Because of the restrictions already itemized, this
522       * paragraph effectively applies only to the situation where both
523       * arrays have component types that are reference types.)
524       *
525       * @param      src          the source array.
526       * @param      src_position start position in the source array.
527       * @param      dst          the destination array.
528       * @param      dst_position pos   start position in the destination data.
529       * @param      length       the number of array elements to be copied.
530       * @exception  IndexOutOfBoundsException  if copying would cause
531       *               access of data outside array bounds.
532       * @exception  ArrayStoreException  if an element in the <code>src</code>
533       *               array could not be stored into the <code>dest</code> array
534       *               because of a type mismatch.
535       * @exception  NullPointerException if either <code>src</code> or
536       *               <code>dst</code> is <code>null</code>.
537       */
538      private void arraycopy( byte[] src, int src_position,
539                              byte[] dst, int dst_position,
540                              int length )
541      {
542          System.arraycopy( src, src_position, dst, dst_position, length );
543      }
544  
545      /**
546       * @return the unfinished string
547       */
548      String getUnfinishedString()
549      {
550          return unfinishedString;
551      }
552  
553      /**
554       * @return true if current string uses wide characters
555       */
556      boolean isWideChar()
557      {
558          return wideChar;
559      }
560  
561  
562  }
563