// include/Dunzipper.hh
// This file is part of libpbe; see http://svn.chezphil.org/libpbe/
// (C) 2011 Philip Endecott

// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

#ifndef pbe_Dunzipper_hh
#define pbe_Dunzipper_hh

#include <FileDescriptor.hh>

#include <boost/scoped_array.hpp>

#include <zlib.h>

#include <stdint.h>

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <new>
#include <string>
#include <vector>

// Decompressor for "dictzip" files.  These are gzip files with additional 
// header information that makes them randomly accessible.


namespace pbe {


class Dunzipper {

  FileDescriptor fd;
  z_stream_s gz;

  size_t chunk_size;  // Decompressed size of chunks; all the same.
  typedef std::vector<size_t> chunk_offsets_t;
  chunk_offsets_t chunk_offsets;  // Offsets to compressed chunks in source file.

  size_t total_decompressed_size_;

  void get_chunk(size_t chunk, char* dest)
  {
    // Decompress the specified chunk into memory at dest.
//std::cout << "getting chunk " << chunk << "\n";

    size_t offset = chunk_offsets[chunk];
    fd.seek(offset);

    gz.avail_in = 0;
    gz.next_out = reinterpret_cast<Bytef*>(dest);
    gz.avail_out = chunk_size;

    char inbuf[8192];
    while (gz.avail_out>0) {
      if (gz.avail_in==0) {
        gz.next_in = reinterpret_cast<Bytef*>(inbuf);
        gz.avail_in = fd.read(inbuf,sizeof(inbuf));
      }
      int r = inflate(&gz,Z_SYNC_FLUSH);
      if (r==Z_STREAM_END) break;
      switch (r) {
        case Z_OK:           break;
        case Z_NEED_DICT:    throw "Z_NEED_DICT";
        case Z_DATA_ERROR:   throw "Z_DATA_ERROR";
        case Z_STREAM_ERROR: throw "Z_STREAM_ERROR";
        case Z_MEM_ERROR:    throw std::bad_alloc();
        default:             throw "misc zlib error";
      }
    }

    inflateReset(&gz);
  }


public:

  Dunzipper(std::string fn):
    fd(fn,FileDescriptor::read_only)
  {
    // Open the file and read the header, which contains the table of offsets 
    // to the individual chunks.
    // (zlib is used to read the header.  With hindsite, it might have been easier 
    // to do it manually; the format is not complex.)

    // Use the default allocator.
    gz.zalloc = Z_NULL;
    gz.zfree = Z_NULL;
    gz.opaque = Z_NULL;

    char inbuf[4096];
    char outbuf[40960];  // We probably don't actually save any decompressed data...

    gz.next_in = reinterpret_cast<Bytef*>(inbuf);
    gz.avail_in = 0;

    gz.next_out = reinterpret_cast<Bytef*>(outbuf);
    gz.avail_out = sizeof(outbuf);

    int r = inflateInit2(&gz, 15+16);  // 15 = windowbits; this is the default;
                                       // adding 16 (ARGH!) magically makes it
                                       // recognise gzip format.
    switch (r) {
      case Z_OK:            break;
      case Z_MEM_ERROR:     throw std::bad_alloc();
      case Z_VERSION_ERROR: throw "Z_VERSION_ERROR";
      default:              throw "misc zlib error";
    }

    gz_header header;
    header.name = Z_NULL;
    header.comment = Z_NULL;
    header.extra_max = 65536;
    boost::scoped_array<Bytef> header_extra_p(new Bytef[header.extra_max]);
    header.extra = header_extra_p.get();
    r = inflateGetHeader(&gz, &header);
    switch (r) {
      case Z_OK:            break;
      case Z_MEM_ERROR:     throw std::bad_alloc();
      case Z_STREAM_ERROR:  throw "Z_STREAM_ERROR";
      default:              throw "misc zlib error";
    }

    size_t offset = 0;  // We need to know how long the header is, so we count the bytes
                        // as they are processed.

    while (header.done==0) {
      if (gz.avail_in==0) {
        gz.next_in = reinterpret_cast<Bytef*>(inbuf);
        gz.avail_in = fd.read(inbuf,sizeof(inbuf));
        offset += gz.avail_in;
      }
      if (gz.avail_out==0) {
        gz.next_out = reinterpret_cast<Bytef*>(outbuf);
        gz.avail_out = sizeof(outbuf);
      }

      int r = inflate(&gz,Z_BLOCK);  // Z_BLOCK means it should stop at the end of the header.
      switch (r) {
        case Z_OK:           break;
        case Z_NEED_DICT:    throw "Z_NEED_DICT";
        case Z_DATA_ERROR:   throw "Z_DATA_ERROR";
        case Z_STREAM_ERROR: throw "Z_STREAM_ERROR";
        case Z_MEM_ERROR:    throw std::bad_alloc();
        default:             throw "misc zlib error";
      }
    }
    offset -= gz.avail_in;
    switch (header.done) {
      case -1:  throw "No gzip header";
    }

    if (!header.extra) throw "No extra field in header - not dictzip";

    struct {
      char magic[2];
      uint16_t len;
      uint16_t ver;
      uint16_t chlen;
      uint16_t chcnt;
    } dictzip_header;

    if (header.extra_len < sizeof(dictzip_header)) throw "Not enough data for dictzip header";

    memcpy(&dictzip_header, header_extra_p.get(), sizeof(dictzip_header));

    if (dictzip_header.magic[0] != 'R' || dictzip_header.magic[1] != 'A') throw "Wrong dictzip magic";
    if (dictzip_header.ver != 1) throw "Wrong dictzip version";

    chunk_size = dictzip_header.chlen;

    // The header contains the compressed chunk sizes, but we want the offsets.
    for (int i=0; i<dictzip_header.chcnt; ++i) {
      chunk_offsets.push_back(offset);
      uint16_t s;
      memcpy(&s, header_extra_p.get() + sizeof(dictzip_header) + i*sizeof(uint16_t), sizeof(s));
      offset += s;
    }

    // This doesn't allow for the final partial chunk:
    total_decompressed_size_ = chunk_size * dictzip_header.chcnt;

    inflateEnd(&gz);

    // Prepare for reading chunks, which are "raw zlib" format i.e. no header.
    inflateInit2(&gz, -15);
  }


  ~Dunzipper() {
    inflateEnd(&gz);
  }


  void read(size_t offset, size_t length, char* dest)
  {
    size_t first_chunk = offset / chunk_size;
    size_t skip_bytes = offset % chunk_size;
    size_t last_chunk = (offset+length-1) / chunk_size;

    for (size_t i = first_chunk; i <= last_chunk; ++i) {
      if (skip_bytes) {
        char buf[chunk_size];
        get_chunk(i,buf);
        size_t l = std::min(chunk_size-skip_bytes, length);
        memcpy(dest, buf+skip_bytes, l);
        dest += l;
        length -= l;
        skip_bytes = 0;

      } else if (length < chunk_size) {
        char buf[chunk_size];
        get_chunk(i,buf);
        memcpy(dest, buf, length);
        dest += length;
        length = 0;

      } else {
        get_chunk(i,dest);
        dest += chunk_size;
        length -= chunk_size;
      }
    }

  }


  size_t total_decompressed_size() const {
    return total_decompressed_size_;
  }

};



};


#endif

