misc/libphysfs/lzma/CPP/7zip/Compress/LZMA_Alone/LzmaBench.cpp
author nemo
Mon, 10 Apr 2017 12:06:43 -0400
changeset 12213 bb5522e88ab2
permissions -rw-r--r--
bulk copy of latest physfs to our misc/libphysfs since this seems to fix an off-by-1 error reliably hit in readln read of 1 byte probably introduced in the addition of the buffered read. Whether this is excessive or whether libphysfs should even be maintained by us is another matter. But at least we shouldn't crash

// LzmaBench.cpp

#include "StdAfx.h"

#include "LzmaBench.h"

#ifndef _WIN32
#define USE_POSIX_TIME
#define USE_POSIX_TIME2
#endif

#ifdef USE_POSIX_TIME
#include <time.h>
#ifdef USE_POSIX_TIME2
#include <sys/time.h>
#endif
#endif

#ifdef _WIN32
#define USE_ALLOCA
#endif

#ifdef USE_ALLOCA
#ifdef _WIN32
#include <malloc.h>
#else
#include <stdlib.h>
#endif
#endif

extern "C" 
{ 
#include "../../../../C/Alloc.h"
#include "../../../../C/7zCrc.h"
}
#include "../../../Common/MyCom.h"
#include "../../ICoder.h"

#ifdef BENCH_MT
#include "../../../Windows/Thread.h"
#include "../../../Windows/Synchronization.h"
#endif

#ifdef EXTERNAL_LZMA
#include "../../../Windows/PropVariant.h"
#else
#include "../LZMA/LZMADecoder.h"
#include "../LZMA/LZMAEncoder.h"
#endif

static const UInt32 kUncompressMinBlockSize = 1 << 26;
static const UInt32 kAdditionalSize = (1 << 16);
static const UInt32 kCompressedAdditionalSize = (1 << 10);
static const UInt32 kMaxLzmaPropSize = 5;

class CBaseRandomGenerator
{
  UInt32 A1;
  UInt32 A2;
public:
  CBaseRandomGenerator() { Init(); }
  void Init() { A1 = 362436069; A2 = 521288629;}
  UInt32 GetRnd() 
  {
    return 
      ((A1 = 36969 * (A1 & 0xffff) + (A1 >> 16)) << 16) +
      ((A2 = 18000 * (A2 & 0xffff) + (A2 >> 16)) );
  }
};

class CBenchBuffer
{
public:
  size_t BufferSize;
  Byte *Buffer;
  CBenchBuffer(): Buffer(0) {} 
  virtual ~CBenchBuffer() { Free(); }
  void Free() 
  { 
    ::MidFree(Buffer);
    Buffer = 0;
  }
  bool Alloc(size_t bufferSize) 
  {
    if (Buffer != 0 && BufferSize == bufferSize)
      return true;
    Free();
    Buffer = (Byte *)::MidAlloc(bufferSize);
    BufferSize = bufferSize;
    return (Buffer != 0);
  }
};

class CBenchRandomGenerator: public CBenchBuffer
{
  CBaseRandomGenerator *RG;
public:
  void Set(CBaseRandomGenerator *rg) { RG = rg; }
  UInt32 GetVal(UInt32 &res, int numBits) 
  {
    UInt32 val = res & (((UInt32)1 << numBits) - 1);
    res >>= numBits;
    return val;
  }
  UInt32 GetLen(UInt32 &res) 
  { 
    UInt32 len = GetVal(res, 2);
    return GetVal(res, 1 + len);
  }
  void Generate()
  {
    UInt32 pos = 0;
    UInt32 rep0 = 1;
    while (pos < BufferSize)
    {
      UInt32 res = RG->GetRnd();
      res >>= 1;
      if (GetVal(res, 1) == 0 || pos < 1024)
        Buffer[pos++] = (Byte)(res & 0xFF);
      else
      {
        UInt32 len;
        len = 1 + GetLen(res);
        if (GetVal(res, 3) != 0)
        {
          len += GetLen(res);
          do
          {
            UInt32 ppp = GetVal(res, 5) + 6;
            res = RG->GetRnd();
            if (ppp > 30)
              continue;
            rep0 = /* (1 << ppp) +*/  GetVal(res, ppp);
            res = RG->GetRnd();
          }
          while (rep0 >= pos);
          rep0++;
        }

        for (UInt32 i = 0; i < len && pos < BufferSize; i++, pos++)
          Buffer[pos] = Buffer[pos - rep0];
      }
    }
  }
};


class CBenchmarkInStream: 
  public ISequentialInStream,
  public CMyUnknownImp
{
  const Byte *Data;
  size_t Pos;
  size_t Size;
public:
  MY_UNKNOWN_IMP
  void Init(const Byte *data, size_t size)
  {
    Data = data;
    Size = size;
    Pos = 0;
  }
  STDMETHOD(Read)(void *data, UInt32 size, UInt32 *processedSize);
};

STDMETHODIMP CBenchmarkInStream::Read(void *data, UInt32 size, UInt32 *processedSize)
{
  size_t remain = Size - Pos;
  UInt32 kMaxBlockSize = (1 << 20);
  if (size > kMaxBlockSize)
    size = kMaxBlockSize;
  if (size > remain)
    size = (UInt32)remain;
  for (UInt32 i = 0; i < size; i++)
    ((Byte *)data)[i] = Data[Pos + i];
  Pos += size;
  if(processedSize != NULL)
    *processedSize = size;
  return S_OK;
}
  
class CBenchmarkOutStream: 
  public ISequentialOutStream,
  public CBenchBuffer,
  public CMyUnknownImp
{
  // bool _overflow;
public:
  UInt32 Pos;
  // CBenchmarkOutStream(): _overflow(false) {} 
  void Init() 
  {
    // _overflow = false;
    Pos = 0;
  }
  MY_UNKNOWN_IMP
  STDMETHOD(Write)(const void *data, UInt32 size, UInt32 *processedSize);
};

STDMETHODIMP CBenchmarkOutStream::Write(const void *data, UInt32 size, UInt32 *processedSize)
{
  size_t curSize = BufferSize - Pos;
  if (curSize > size)
    curSize = size;
  memcpy(Buffer + Pos, data, curSize);
  Pos += (UInt32)curSize;
  if(processedSize != NULL)
    *processedSize = (UInt32)curSize;
  if (curSize != size)
  {
    // _overflow = true;
    return E_FAIL;
  }
  return S_OK;
}
  
class CCrcOutStream: 
  public ISequentialOutStream,
  public CMyUnknownImp
{
public:
  UInt32 Crc;
  MY_UNKNOWN_IMP
  void Init() { Crc = CRC_INIT_VAL; }
  STDMETHOD(Write)(const void *data, UInt32 size, UInt32 *processedSize);
};

STDMETHODIMP CCrcOutStream::Write(const void *data, UInt32 size, UInt32 *processedSize)
{
  Crc = CrcUpdate(Crc, data, size);
  if (processedSize != NULL)
    *processedSize = size;
  return S_OK;
}
  
static UInt64 GetTimeCount()
{
  #ifdef USE_POSIX_TIME
  #ifdef USE_POSIX_TIME2
  timeval v;
  if (gettimeofday(&v, 0) == 0)
    return (UInt64)(v.tv_sec) * 1000000 + v.tv_usec;
  return (UInt64)time(NULL) * 1000000;
  #else
  return time(NULL);
  #endif
  #else
  /*
  LARGE_INTEGER value;
  if (::QueryPerformanceCounter(&value))
    return value.QuadPart;
  */
  return GetTickCount();
  #endif 
}

static UInt64 GetFreq()
{
  #ifdef USE_POSIX_TIME
  #ifdef USE_POSIX_TIME2
  return 1000000;
  #else
  return 1;
  #endif 
  #else
  /*
  LARGE_INTEGER value;
  if (::QueryPerformanceFrequency(&value))
    return value.QuadPart;
  */
  return 1000;
  #endif 
}

#ifndef USE_POSIX_TIME
static inline UInt64 GetTime64(const FILETIME &t) { return ((UInt64)t.dwHighDateTime << 32) | t.dwLowDateTime; }
#endif
static UInt64 GetUserTime()
{
  #ifdef USE_POSIX_TIME
  return clock();
  #else
  FILETIME creationTime, exitTime, kernelTime, userTime;
  if (::GetProcessTimes(::GetCurrentProcess(), &creationTime, &exitTime, &kernelTime, &userTime) != 0)
    return GetTime64(userTime) + GetTime64(kernelTime);
  return (UInt64)GetTickCount() * 10000;
  #endif 
}

static UInt64 GetUserFreq()
{
  #ifdef USE_POSIX_TIME
  return CLOCKS_PER_SEC;
  #else
  return 10000000;
  #endif 
}

class CBenchProgressStatus
{
  #ifdef BENCH_MT
  NWindows::NSynchronization::CCriticalSection CS;  
  #endif
public:
  HRESULT Res;
  bool EncodeMode;
  void SetResult(HRESULT res) 
  {
    #ifdef BENCH_MT
    NWindows::NSynchronization::CCriticalSectionLock lock(CS);
    #endif
    Res = res;
  }
  HRESULT GetResult()
  {
    #ifdef BENCH_MT
    NWindows::NSynchronization::CCriticalSectionLock lock(CS);
    #endif
    return Res;
  }
};

class CBenchProgressInfo:
  public ICompressProgressInfo,
  public CMyUnknownImp
{
public:
  CBenchProgressStatus *Status;
  CBenchInfo BenchInfo;
  HRESULT Res;
  IBenchCallback *callback;
  CBenchProgressInfo(): callback(0) {}
  MY_UNKNOWN_IMP
  STDMETHOD(SetRatioInfo)(const UInt64 *inSize, const UInt64 *outSize);
};

void SetStartTime(CBenchInfo &bi)
{
  bi.GlobalFreq = GetFreq();
  bi.UserFreq = GetUserFreq();
  bi.GlobalTime = ::GetTimeCount();
  bi.UserTime = ::GetUserTime();
}

void SetFinishTime(const CBenchInfo &biStart, CBenchInfo &dest)
{
  dest.GlobalFreq = GetFreq();
  dest.UserFreq = GetUserFreq();
  dest.GlobalTime = ::GetTimeCount() - biStart.GlobalTime;
  dest.UserTime = ::GetUserTime() - biStart.UserTime;
}

STDMETHODIMP CBenchProgressInfo::SetRatioInfo(const UInt64 *inSize, const UInt64 *outSize)
{
  HRESULT res = Status->GetResult();
  if (res != S_OK)
    return res;
  if (!callback)
    return res;
  CBenchInfo info = BenchInfo;
  SetFinishTime(BenchInfo, info);
  if (Status->EncodeMode)
  {
    info.UnpackSize = *inSize;
    info.PackSize = *outSize;
    res = callback->SetEncodeResult(info, false);
  }
  else
  {
    info.PackSize = BenchInfo.PackSize + *inSize;
    info.UnpackSize = BenchInfo.UnpackSize + *outSize;
    res = callback->SetDecodeResult(info, false);
  }
  if (res != S_OK)
    Status->SetResult(res);
  return res;
}

static const int kSubBits = 8;

static UInt32 GetLogSize(UInt32 size)
{
  for (int i = kSubBits; i < 32; i++)
    for (UInt32 j = 0; j < (1 << kSubBits); j++)
      if (size <= (((UInt32)1) << i) + (j << (i - kSubBits)))
        return (i << kSubBits) + j;
  return (32 << kSubBits);
}

static void NormalizeVals(UInt64 &v1, UInt64 &v2)
{
  while (v1 > 1000000)
  {
    v1 >>= 1;
    v2 >>= 1;
  }
}

UInt64 GetUsage(const CBenchInfo &info)
{
  UInt64 userTime = info.UserTime;
  UInt64 userFreq = info.UserFreq;
  UInt64 globalTime = info.GlobalTime;
  UInt64 globalFreq = info.GlobalFreq;
  NormalizeVals(userTime, userFreq);
  NormalizeVals(globalFreq, globalTime);
  if (userFreq == 0)
    userFreq = 1;
  if (globalTime == 0)
    globalTime = 1;
  return userTime * globalFreq * 1000000 / userFreq / globalTime;
}

UInt64 GetRatingPerUsage(const CBenchInfo &info, UInt64 rating)
{
  UInt64 userTime = info.UserTime;
  UInt64 userFreq = info.UserFreq;
  UInt64 globalTime = info.GlobalTime;
  UInt64 globalFreq = info.GlobalFreq;
  NormalizeVals(userFreq, userTime);
  NormalizeVals(globalTime, globalFreq);
  if (globalFreq == 0)
    globalFreq = 1;
  if (userTime == 0)
    userTime = 1;
  return userFreq * globalTime / globalFreq *  rating / userTime;
}

static UInt64 MyMultDiv64(UInt64 value, UInt64 elapsedTime, UInt64 freq)
{
  UInt64 elTime = elapsedTime;
  NormalizeVals(freq, elTime);
  if (elTime == 0)
    elTime = 1;
  return value * freq / elTime;
}

UInt64 GetCompressRating(UInt32 dictionarySize, UInt64 elapsedTime, UInt64 freq, UInt64 size)
{
  UInt64 t = GetLogSize(dictionarySize) - (kBenchMinDicLogSize << kSubBits);
  // UInt64 numCommandsForOne = 1000 + ((t * t * 7) >> (2 * kSubBits)); // AMD K8
  UInt64 numCommandsForOne = 870 + ((t * t * 5) >> (2 * kSubBits)); // Intel Core2

  UInt64 numCommands = (UInt64)(size) * numCommandsForOne;
  return MyMultDiv64(numCommands, elapsedTime, freq);
}

UInt64 GetDecompressRating(UInt64 elapsedTime, UInt64 freq, UInt64 outSize, UInt64 inSize, UInt32 numIterations)
{
  // UInt64 numCommands = (inSize * 216 + outSize * 14) * numIterations; // AMD K8
  UInt64 numCommands = (inSize * 220 + outSize * 8) * numIterations; // Intel Core2
  return MyMultDiv64(numCommands, elapsedTime, freq);
}

#ifdef EXTERNAL_LZMA
typedef UInt32 (WINAPI * CreateObjectPointer)(const GUID *clsID, 
    const GUID *interfaceID, void **outObject);
#endif

struct CEncoderInfo;

struct CEncoderInfo
{
  #ifdef BENCH_MT
  NWindows::CThread thread[2];
  #endif
  CMyComPtr<ICompressCoder> encoder;
  CBenchProgressInfo *progressInfoSpec[2];
  CMyComPtr<ICompressProgressInfo> progressInfo[2];
  UInt32 NumIterations;
  #ifdef USE_ALLOCA
  size_t AllocaSize;
  #endif

  struct CDecoderInfo
  {
    CEncoderInfo *Encoder;
    UInt32 DecoderIndex;
    #ifdef USE_ALLOCA
    size_t AllocaSize;
    #endif
    bool CallbackMode;
  };
  CDecoderInfo decodersInfo[2];

  CMyComPtr<ICompressCoder> decoders[2];
  HRESULT Results[2];
  CBenchmarkOutStream *outStreamSpec;
  CMyComPtr<ISequentialOutStream> outStream;
  IBenchCallback *callback;
  UInt32 crc;
  UInt32 kBufferSize;
  UInt32 compressedSize;
  CBenchRandomGenerator rg;
  CBenchmarkOutStream *propStreamSpec;
  CMyComPtr<ISequentialOutStream> propStream;
  HRESULT Init(UInt32 dictionarySize, UInt32 numThreads, CBaseRandomGenerator *rg);
  HRESULT Encode();
  HRESULT Decode(UInt32 decoderIndex);

  CEncoderInfo(): outStreamSpec(0), callback(0), propStreamSpec(0) {}

  #ifdef BENCH_MT
  static THREAD_FUNC_DECL EncodeThreadFunction(void *param)
  {
    CEncoderInfo *encoder = (CEncoderInfo *)param;
    #ifdef USE_ALLOCA
    alloca(encoder->AllocaSize);
    #endif
    HRESULT res = encoder->Encode();
    encoder->Results[0] = res;
    if (res != S_OK)
      encoder->progressInfoSpec[0]->Status->SetResult(res);

    return 0;
  }
  static THREAD_FUNC_DECL DecodeThreadFunction(void *param)
  {
    CDecoderInfo *decoder = (CDecoderInfo *)param;
    #ifdef USE_ALLOCA
    alloca(decoder->AllocaSize);
    #endif
    CEncoderInfo *encoder = decoder->Encoder;
    encoder->Results[decoder->DecoderIndex] = encoder->Decode(decoder->DecoderIndex);
    return 0;
  }

  HRESULT CreateEncoderThread()
  {
    return thread[0].Create(EncodeThreadFunction, this);
  }

  HRESULT CreateDecoderThread(int index, bool callbackMode
      #ifdef USE_ALLOCA
      , size_t allocaSize
      #endif
      )
  {
    CDecoderInfo &decoder = decodersInfo[index];
    decoder.DecoderIndex = index;
    decoder.Encoder = this;
    #ifdef USE_ALLOCA
    decoder.AllocaSize = allocaSize;
    #endif
    decoder.CallbackMode = callbackMode;
    return thread[index].Create(DecodeThreadFunction, &decoder);
  }
  #endif
};

HRESULT CEncoderInfo::Init(UInt32 dictionarySize, UInt32 numThreads, CBaseRandomGenerator *rgLoc)
{
  rg.Set(rgLoc);
  kBufferSize = dictionarySize + kAdditionalSize;
  UInt32 kCompressedBufferSize = (kBufferSize / 2) + kCompressedAdditionalSize;
  if (!rg.Alloc(kBufferSize))
    return E_OUTOFMEMORY;
  rg.Generate();
  crc = CrcCalc(rg.Buffer, rg.BufferSize);

  outStreamSpec = new CBenchmarkOutStream;
  if (!outStreamSpec->Alloc(kCompressedBufferSize))
    return E_OUTOFMEMORY;

  outStream = outStreamSpec;

  propStreamSpec = 0;
  if (!propStream)
  {
    propStreamSpec = new CBenchmarkOutStream;
    propStream = propStreamSpec;
  }
  if (!propStreamSpec->Alloc(kMaxLzmaPropSize))
    return E_OUTOFMEMORY;
  propStreamSpec->Init();
  
  PROPID propIDs[] = 
  { 
    NCoderPropID::kDictionarySize, 
    NCoderPropID::kMultiThread
  };
  const int kNumProps = sizeof(propIDs) / sizeof(propIDs[0]);
  PROPVARIANT properties[kNumProps];
  properties[0].vt = VT_UI4;
  properties[0].ulVal = (UInt32)dictionarySize;

  properties[1].vt = VT_BOOL;
  properties[1].boolVal = (numThreads > 1) ? VARIANT_TRUE : VARIANT_FALSE;

  {
    CMyComPtr<ICompressSetCoderProperties> setCoderProperties;
    RINOK(encoder.QueryInterface(IID_ICompressSetCoderProperties, &setCoderProperties));
    if (!setCoderProperties)
      return E_FAIL;
    RINOK(setCoderProperties->SetCoderProperties(propIDs, properties, kNumProps));

    CMyComPtr<ICompressWriteCoderProperties> writeCoderProperties;
    encoder.QueryInterface(IID_ICompressWriteCoderProperties, &writeCoderProperties);
    if (writeCoderProperties)
    {
      RINOK(writeCoderProperties->WriteCoderProperties(propStream));
    }
  }
  return S_OK;
}

HRESULT CEncoderInfo::Encode()
{
  CBenchmarkInStream *inStreamSpec = new CBenchmarkInStream;
  CMyComPtr<ISequentialInStream> inStream = inStreamSpec;
  inStreamSpec->Init(rg.Buffer, rg.BufferSize);
  outStreamSpec->Init();

  RINOK(encoder->Code(inStream, outStream, 0, 0, progressInfo[0]));
  compressedSize = outStreamSpec->Pos;
  encoder.Release();
  return S_OK;
}

HRESULT CEncoderInfo::Decode(UInt32 decoderIndex)
{
  CBenchmarkInStream *inStreamSpec = new CBenchmarkInStream;
  CMyComPtr<ISequentialInStream> inStream = inStreamSpec;
  CMyComPtr<ICompressCoder> &decoder = decoders[decoderIndex];

  CMyComPtr<ICompressSetDecoderProperties2> compressSetDecoderProperties;
  decoder.QueryInterface(IID_ICompressSetDecoderProperties2, &compressSetDecoderProperties);
  if (!compressSetDecoderProperties)
    return E_FAIL;

  CCrcOutStream *crcOutStreamSpec = new CCrcOutStream;
  CMyComPtr<ISequentialOutStream> crcOutStream = crcOutStreamSpec;
    
  CBenchProgressInfo *pi = progressInfoSpec[decoderIndex];
  pi->BenchInfo.UnpackSize = 0;
  pi->BenchInfo.PackSize = 0;

  for (UInt32 j = 0; j < NumIterations; j++)
  {
    inStreamSpec->Init(outStreamSpec->Buffer, compressedSize);
    crcOutStreamSpec->Init();
    
    RINOK(compressSetDecoderProperties->SetDecoderProperties2(propStreamSpec->Buffer, propStreamSpec->Pos));
    UInt64 outSize = kBufferSize;
    RINOK(decoder->Code(inStream, crcOutStream, 0, &outSize, progressInfo[decoderIndex]));
    if (CRC_GET_DIGEST(crcOutStreamSpec->Crc) != crc)
      return S_FALSE;
    pi->BenchInfo.UnpackSize += kBufferSize;
    pi->BenchInfo.PackSize += compressedSize;
  }
  decoder.Release();
  return S_OK;
}

static const UInt32 kNumThreadsMax = (1 << 16);

struct CBenchEncoders
{
  CEncoderInfo *encoders;
  CBenchEncoders(UInt32 num): encoders(0) { encoders = new CEncoderInfo[num]; }
  ~CBenchEncoders() { delete []encoders; }
};

HRESULT LzmaBench(
  #ifdef EXTERNAL_LZMA
  CCodecs *codecs,
  #endif
  UInt32 numThreads, UInt32 dictionarySize, IBenchCallback *callback)
{
  UInt32 numEncoderThreads = 
    #ifdef BENCH_MT
    (numThreads > 1 ? numThreads / 2 : 1);
    #else
    1;
    #endif
  UInt32 numSubDecoderThreads = 
    #ifdef BENCH_MT
    (numThreads > 1 ? 2 : 1);
    #else
    1;
    #endif
  if (dictionarySize < (1 << kBenchMinDicLogSize) || numThreads < 1 || numEncoderThreads > kNumThreadsMax)
  {
    return E_INVALIDARG;
  }

  CBenchEncoders encodersSpec(numEncoderThreads);
  CEncoderInfo *encoders = encodersSpec.encoders;

  #ifdef EXTERNAL_LZMA
  UString name = L"LZMA";
  #endif

  UInt32 i;
  for (i = 0; i < numEncoderThreads; i++)
  {
    CEncoderInfo &encoder = encoders[i];
    encoder.callback = (i == 0) ? callback : 0;

    #ifdef EXTERNAL_LZMA
    RINOK(codecs->CreateCoder(name, true, encoder.encoder));
    #else
    encoder.encoder = new NCompress::NLZMA::CEncoder;
    #endif
    for (UInt32 j = 0; j < numSubDecoderThreads; j++)
    {
      #ifdef EXTERNAL_LZMA
      RINOK(codecs->CreateCoder(name, false, encoder.decoders[j]));
      #else
      encoder.decoders[j] = new NCompress::NLZMA::CDecoder;
      #endif
    }
  }

  CBaseRandomGenerator rg;
  rg.Init();
  for (i = 0; i < numEncoderThreads; i++)
  {
    RINOK(encoders[i].Init(dictionarySize, numThreads, &rg));
  }

  CBenchProgressStatus status;
  status.Res = S_OK;
  status.EncodeMode = true;

  for (i = 0; i < numEncoderThreads; i++)
  {
    CEncoderInfo &encoder = encoders[i];
    for (int j = 0; j < 2; j++)
    {
      encoder.progressInfo[j] = encoder.progressInfoSpec[j] = new CBenchProgressInfo;
      encoder.progressInfoSpec[j]->Status = &status;
    }
    if (i == 0)
    {
      encoder.progressInfoSpec[0]->callback = callback;
      encoder.progressInfoSpec[0]->BenchInfo.NumIterations = numEncoderThreads;
      SetStartTime(encoder.progressInfoSpec[0]->BenchInfo);
    }

    #ifdef BENCH_MT
    if (numEncoderThreads > 1)
    {
      #ifdef USE_ALLOCA
      encoder.AllocaSize = (i * 16 * 21) & 0x7FF;
      #endif
      RINOK(encoder.CreateEncoderThread())
    }
    else
    #endif
    {
      RINOK(encoder.Encode());
    }
  }
  #ifdef BENCH_MT
  if (numEncoderThreads > 1)
    for (i = 0; i < numEncoderThreads; i++)
      encoders[i].thread[0].Wait();
  #endif

  RINOK(status.Res);

  CBenchInfo info;

  SetFinishTime(encoders[0].progressInfoSpec[0]->BenchInfo, info);
  info.UnpackSize = 0;
  info.PackSize = 0;
  info.NumIterations = 1; // progressInfoSpec->NumIterations;
  for (i = 0; i < numEncoderThreads; i++)
  {
    CEncoderInfo &encoder = encoders[i];
    info.UnpackSize += encoder.kBufferSize;
    info.PackSize += encoder.compressedSize;
  }
  RINOK(callback->SetEncodeResult(info, true));


  status.Res = S_OK;
  status.EncodeMode = false;

  UInt32 numDecoderThreads = numEncoderThreads * numSubDecoderThreads;
  for (i = 0; i < numEncoderThreads; i++)
  {
    CEncoderInfo &encoder = encoders[i];
    encoder.NumIterations = 2 + kUncompressMinBlockSize / encoder.kBufferSize;

    if (i == 0)
    {
      encoder.progressInfoSpec[0]->callback = callback;
      encoder.progressInfoSpec[0]->BenchInfo.NumIterations = numDecoderThreads;
      SetStartTime(encoder.progressInfoSpec[0]->BenchInfo);
    }

    #ifdef BENCH_MT
    if (numDecoderThreads > 1)
    {
      for (UInt32 j = 0; j < numSubDecoderThreads; j++)
      {
        size_t allocaSize = ((i * numSubDecoderThreads + j) * 16 * 21) & 0x7FF;
        HRESULT res = encoder.CreateDecoderThread(j, (i == 0 && j == 0)
            #ifdef USE_ALLOCA
            , allocaSize
            #endif
            );
        RINOK(res);
      }
    }
    else
    #endif
    {
      RINOK(encoder.Decode(0));
    }
  }
  #ifdef BENCH_MT
  HRESULT res = S_OK;
  if (numDecoderThreads > 1)
    for (i = 0; i < numEncoderThreads; i++)
      for (UInt32 j = 0; j < numSubDecoderThreads; j++)
      {
        CEncoderInfo &encoder = encoders[i];
        encoder.thread[j].Wait();
        if (encoder.Results[j] != S_OK)
          res = encoder.Results[j];
      }
  RINOK(res);
  #endif
  RINOK(status.Res);
  SetFinishTime(encoders[0].progressInfoSpec[0]->BenchInfo, info);
  info.UnpackSize = 0;
  info.PackSize = 0;
  info.NumIterations = numSubDecoderThreads * encoders[0].NumIterations;
  for (i = 0; i < numEncoderThreads; i++)
  {
    CEncoderInfo &encoder = encoders[i];
    info.UnpackSize += encoder.kBufferSize;
    info.PackSize += encoder.compressedSize;
  }
  RINOK(callback->SetDecodeResult(info, false));
  RINOK(callback->SetDecodeResult(info, true));
  return S_OK;
}


inline UInt64 GetLZMAUsage(bool multiThread, UInt32 dictionary)
{ 
  UInt32 hs = dictionary - 1;
  hs |= (hs >> 1);
  hs |= (hs >> 2);
  hs |= (hs >> 4);
  hs |= (hs >> 8);
  hs >>= 1;
  hs |= 0xFFFF;
  if (hs > (1 << 24))
    hs >>= 1;
  hs++;
  return ((hs + (1 << 16)) + (UInt64)dictionary * 2) * 4 + (UInt64)dictionary * 3 / 2 + 
      (1 << 20) + (multiThread ? (6 << 20) : 0);
}

UInt64 GetBenchMemoryUsage(UInt32 numThreads, UInt32 dictionary)
{
  const UInt32 kBufferSize = dictionary;
  const UInt32 kCompressedBufferSize = (kBufferSize / 2);
  UInt32 numSubThreads = (numThreads > 1) ? 2 : 1;
  UInt32 numBigThreads = numThreads / numSubThreads;
  return (kBufferSize + kCompressedBufferSize +
    GetLZMAUsage((numThreads > 1), dictionary) + (2 << 20)) * numBigThreads;
}

static bool CrcBig(const void *data, UInt32 size, UInt32 numCycles, UInt32 crcBase)
{
  for (UInt32 i = 0; i < numCycles; i++)
    if (CrcCalc(data, size) != crcBase)
      return false;
  return true;
}

#ifdef BENCH_MT
struct CCrcInfo
{
  NWindows::CThread Thread;
  const Byte *Data;
  UInt32 Size;
  UInt32 NumCycles;
  UInt32 Crc;
  bool Res;
  void Wait()
  {
    Thread.Wait();
    Thread.Close();
  }
};

static THREAD_FUNC_DECL CrcThreadFunction(void *param)
{
  CCrcInfo *p = (CCrcInfo *)param;
  p->Res = CrcBig(p->Data, p->Size, p->NumCycles, p->Crc);
  return 0;
}

struct CCrcThreads
{
  UInt32 NumThreads;
  CCrcInfo *Items;
  CCrcThreads(): Items(0), NumThreads(0) {}
  void WaitAll()
  {
    for (UInt32 i = 0; i < NumThreads; i++)
      Items[i].Wait();
    NumThreads = 0;
  }
  ~CCrcThreads() 
  { 
    WaitAll();
    delete []Items; 
  }
};
#endif

static UInt32 CrcCalc1(const Byte *buf, UInt32 size)
{
  UInt32 crc = CRC_INIT_VAL;;
  for (UInt32 i = 0; i < size; i++)
    crc = CRC_UPDATE_BYTE(crc, buf[i]);
  return CRC_GET_DIGEST(crc);
}

static void RandGen(Byte *buf, UInt32 size, CBaseRandomGenerator &RG)
{
  for (UInt32 i = 0; i < size; i++)
    buf[i] = (Byte)RG.GetRnd();
}

static UInt32 RandGenCrc(Byte *buf, UInt32 size, CBaseRandomGenerator &RG)
{
  RandGen(buf, size, RG);
  return CrcCalc1(buf, size);
}

bool CrcInternalTest()
{
  CBenchBuffer buffer;
  const UInt32 kBufferSize0 = (1 << 8);
  const UInt32 kBufferSize1 = (1 << 10);
  const UInt32 kCheckSize = (1 << 5);
  if (!buffer.Alloc(kBufferSize0 + kBufferSize1))
    return false;
  Byte *buf = buffer.Buffer;
  UInt32 i;
  for (i = 0; i < kBufferSize0; i++)
    buf[i] = (Byte)i;
  UInt32 crc1 = CrcCalc1(buf, kBufferSize0);
  if (crc1 != 0x29058C73)
    return false;
  CBaseRandomGenerator RG;
  RandGen(buf + kBufferSize0, kBufferSize1, RG);
  for (i = 0; i < kBufferSize0 + kBufferSize1 - kCheckSize; i++)
    for (UInt32 j = 0; j < kCheckSize; j++)
      if (CrcCalc1(buf + i, j) != CrcCalc(buf + i, j))
        return false;
  return true;
}

HRESULT CrcBench(UInt32 numThreads, UInt32 bufferSize, UInt64 &speed)
{
  if (numThreads == 0)
    numThreads = 1;

  CBenchBuffer buffer;
  size_t totalSize = (size_t)bufferSize * numThreads;
  if (totalSize / numThreads != bufferSize)
    return E_OUTOFMEMORY;
  if (!buffer.Alloc(totalSize))
    return E_OUTOFMEMORY;

  Byte *buf = buffer.Buffer;
  CBaseRandomGenerator RG;
  UInt32 numCycles = ((UInt32)1 << 30) / ((bufferSize >> 2) + 1) + 1;

  UInt64 timeVal;
  #ifdef BENCH_MT
  CCrcThreads threads;
  if (numThreads > 1)
  {
    threads.Items = new CCrcInfo[numThreads];
    UInt32 i;
    for (i = 0; i < numThreads; i++)
    {
      CCrcInfo &info = threads.Items[i];
      Byte *data = buf + (size_t)bufferSize * i;
      info.Data = data;
      info.NumCycles = numCycles;
      info.Size = bufferSize;
      info.Crc = RandGenCrc(data, bufferSize, RG);
    }
    timeVal = GetTimeCount();
    for (i = 0; i < numThreads; i++)
    {
      CCrcInfo &info = threads.Items[i];
      RINOK(info.Thread.Create(CrcThreadFunction, &info));
      threads.NumThreads++;
    }
    threads.WaitAll();
    for (i = 0; i < numThreads; i++)
      if (!threads.Items[i].Res)
        return S_FALSE;
  }
  else
  #endif
  {
    UInt32 crc = RandGenCrc(buf, bufferSize, RG);
    timeVal = GetTimeCount();
    if (!CrcBig(buf, bufferSize, numCycles, crc))
      return S_FALSE;
  }
  timeVal = GetTimeCount() - timeVal;
  if (timeVal == 0)
    timeVal = 1;

  UInt64 size = (UInt64)numCycles * totalSize;
  speed = MyMultDiv64(size, timeVal, GetFreq());
  return S_OK;
}