qsdecoder/QuickSyncUtils.cpp
egur 4e33dab63d 0.43:
- Default to D3D9 on supported platforms due to better performance.
- GetCodecName now returns acceleration as part of the string (SW, HW D3D9, HW D3D11)
- Tweaked performance of D3D11 allocator (~50% faster) and copy code. Still much slower than D3D9.

git-svn-id: svn://svn.code.sf.net/p/qsdecoder/code/trunk/IntelQuickSyncDecoder@90 dfccccb7-dd81-40b7-a334-5a7ba89c2b1d
2013-05-24 14:02:20 +00:00

/*
* Copyright (c) 2011, INTEL CORPORATION
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
* Neither the name of INTEL CORPORATION nor the names of its contributors may
* be used to endorse or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "stdafx.h"
#include "QuickSync_defs.h"
#include "CodecInfo.h"
#include "QuickSyncUtils.h"
static bool s_SSE4_1_enabled = IsSSE41Enabled();
static bool s_AVX2_enabled = IsAVX2Enabled();
static const
struct
{
// actual implementation
mfxIMPL impl;
// adapter's number
int adapterID;
} implTypes[] =
{
{MFX_IMPL_HARDWARE, 0},
{MFX_IMPL_SOFTWARE, -1},
{MFX_IMPL_HARDWARE2, 1},
{MFX_IMPL_HARDWARE3, 2},
{MFX_IMPL_HARDWARE4, 3}
};
// gpu_memcpy_sse41 is a memcpy-style function that copies data very fast from
// GPU tiled memory (uncacheable speculative write combining - USWC - memory).
// Performance tip: the page offset (12 LSBs) of the two addresses should differ;
// optimally, use a 2K offset between them.
void* gpu_memcpy_sse41(void* d, const void* s, size_t size)
{
static const size_t regsInLoop = 2;
if (d == NULL || s == NULL) return NULL;
// If memory is not aligned, use memcpy
bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0;
if (!(isAligned && s_SSE4_1_enabled))
{
return memcpy(d, s, size);
}
size_t reminder = size & (regsInLoop * sizeof(__m128i) - 1); // Copy 32 bytes every loop
size_t end = 0;
__m128i xmm0, xmm1; // Will actually use xmm registers
__m128i* pTrg = (__m128i*)d;
__m128i* pTrgEnd = pTrg + ((size - reminder) >> 4);
__m128i* pSrc = (__m128i*)s;
// Make sure source is synced - doesn't hurt if not needed.
_mm_sfence();
while (pTrg < pTrgEnd)
{
// _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA
// Fastest method for copying GPU RAM. Available since Penryn (45nm Core 2 Duo/Quad)
xmm0 = _mm_stream_load_si128(pSrc);
xmm1 = _mm_stream_load_si128(pSrc + 1);
pSrc += regsInLoop;
// _mm_store_si128 emits the SSE2 instruction MOVDQA (aligned store).
// Note: a streaming store (which bypasses the L3 cache) would probably be faster in synthetic
// benchmarks but slower in real world usage, since the buffer will be used by the application
// soon and it is better to keep the L3 cache hot.
_mm_store_si128(pTrg    , xmm0);
_mm_store_si128(pTrg + 1, xmm1);
pTrg += regsInLoop;
}
// Copy in 16 byte steps
if (reminder >= 16)
{
size = reminder;
reminder = size & 15;
end = size >> 4;
for (size_t i = 0; i < end; ++i)
{
pTrg[i] = _mm_stream_load_si128(pSrc + i);
}
}
// Copy the last bytes - shouldn't happen as strides are a multiple of 16
if (reminder)
{
__m128i temp = _mm_stream_load_si128(pSrc + end);
char* ps = (char*)(&temp);
char* pt = (char*)(pTrg + end);
for (size_t i = 0; i < reminder; ++i)
{
pt[i] = ps[i];
}
}
return d;
}
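// Illustrative usage sketch (not part of the original code): copying one pitched
// plane, row by row, from a GPU (USWC) surface into a system-memory buffer.
// All parameter names here are hypothetical.
static void CopyPlaneSketch(unsigned char* dst, size_t dstPitch,
    const unsigned char* src, size_t srcPitch,
    size_t rowBytes, size_t rows)
{
    for (size_t y = 0; y < rows; ++y)
    {
        // Rows are copied one at a time because source and destination pitches may differ
        gpu_memcpy_sse41(dst + y * dstPitch, src + y * srcPitch, rowBytes);
    }
}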
// AVX2 copy function. Available since Haswell (4th Generation Core architecture) on premium models.
#if !defined (AVX2_ENABLED)
void* gpu_memcpy_avx2(void* d, const void* s, size_t size)
{
return gpu_memcpy_sse41(d, s, size);
}
#else //#if defined (AVX2_ENABLED)
void* gpu_memcpy_avx2(void* d, const void* s, size_t size)
{
// Must be a power of 2
static const size_t regsInLoop = 4; // TODO: need to tune this
if (d == NULL || s == NULL) return NULL;
// If memory is not AVX aligned (32B) or AVX2 is unavailable, fall back to gpu_memcpy_sse41
// TODO: check actual performance hit
bool isAligned = (((size_t)(s) | (size_t)(d)) & 0x1F) == 0;
if (!(isAligned && s_AVX2_enabled))
{
return gpu_memcpy_sse41(d, s, size);
}
size_t reminder = size & (regsInLoop * sizeof(__m256i) - 1); // Copy 128 bytes every loop
size_t end = 0;
__m256i ymm0, ymm1, ymm2, ymm3; // Will actually use ymm registers
__m256i* pTrg = (__m256i*)d;
__m256i* pTrgEnd = pTrg + ((size - reminder) >> 5);
__m256i* pSrc = (__m256i*)s;
// Make sure source is synced - doesn't hurt if not needed.
_mm_sfence();
while (pTrg < pTrgEnd)
{
// _mm256_stream_load_si256 emits the AVX2 instruction VMOVNTDQA
// Available since Haswell (22nm, 4th Generation Core)
ymm0 = _mm256_stream_load_si256(pSrc);
ymm1 = _mm256_stream_load_si256(pSrc + 1);
ymm2 = _mm256_stream_load_si256(pSrc + 2);
ymm3 = _mm256_stream_load_si256(pSrc + 3);
pSrc += regsInLoop;
// _mm256_store_si256 emits the AVX instruction VMOVDQA (aligned store).
// Note: a streaming store (which bypasses the L3 cache) would probably be faster in synthetic
// benchmarks but slower in real world usage, since the buffer will be used by the application
// soon and it is better to keep the L3 cache hot.
_mm256_store_si256(pTrg , ymm0);
_mm256_store_si256(pTrg + 1, ymm1);
_mm256_store_si256(pTrg + 2, ymm2);
_mm256_store_si256(pTrg + 3, ymm3);
pTrg += regsInLoop;
}
// Copy in 32 byte steps
if (reminder >= 32)
{
size = reminder;
reminder = size & 31;
end = size >> 5;
for (size_t i = 0; i < end; ++i)
{
pTrg[i] = _mm256_stream_load_si256(pSrc + i);
}
}
// Copy last bytes
if (reminder)
{
__m256i temp = _mm256_stream_load_si256(pSrc + end);
char* ps = (char*)(&temp);
char* pt = (char*)(pTrg + end);
for (size_t i = 0; i < reminder; ++i)
{
pt[i] = ps[i];
}
}
return d;
}
#endif // #if defined (AVX2_ENABLED)
#pragma pack(push, 8)
typedef struct tagTHREADNAME_INFO
{
DWORD dwType; // Must be 0x1000.
LPCSTR szName; // Pointer to name (in user addr space).
DWORD dwThreadID; // Thread ID (-1=caller thread).
DWORD dwFlags; // Reserved for future use, must be zero.
} THREADNAME_INFO;
#pragma pack(pop)
/*
Sets a thread's name. Useful for identifying threads in the Visual Studio debugger.
*/
void SetThreadName(LPCSTR szThreadName, DWORD dwThreadID)
{
THREADNAME_INFO info;
info.dwType = 0x1000;
info.szName = szThreadName;
info.dwThreadID = dwThreadID;
info.dwFlags = 0;
__try
{
#define MS_VC_EXCEPTION 0x406D1388
RaiseException(MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info);
}
// Disable static analysis warnings about this code
#pragma warning(push)
#pragma warning(disable: 6312 6322)
__except(EXCEPTION_CONTINUE_EXECUTION)
{
}
#pragma warning(pop)
}
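// Example usage (illustrative): name the calling thread so it is identifiable in the
// Visual Studio threads window; passing -1 targets the current thread.
//   SetThreadName("QuickSync decoder", (DWORD)-1);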
#ifdef _DEBUG
void DebugAssert(const TCHAR* pCondition,const TCHAR* pFileName, int iLine)
{
TCHAR szInfo[2048];
_sntprintf_s(szInfo, _countof(szInfo), _TRUNCATE, _T("%s \nAt line %d of %s\nContinue? (Cancel to debug)"), pCondition, iLine, pFileName);
int msgId = MessageBox(NULL, szInfo, _T("ASSERT Failed"),
MB_SYSTEMMODAL | MB_ICONHAND | MB_YESNOCANCEL | MB_SETFOREGROUND);
switch (msgId)
{
case IDNO: /* Kill the application */
FatalAppExit(FALSE, _T("Application terminated"));
break;
case IDCANCEL: /* Break into the debugger */
DebugBreak();
break;
case IDYES: /* Ignore assertion continue execution */
break;
}
}
#endif //_DEBUG
// Finds the greatest common divisor
mfxU32 GCD(mfxU32 a, mfxU32 b)
{
if (0 == a)
return b;
else if (0 == b)
return a;
mfxU32 a1, b1;
if (a >= b)
{
a1 = a;
b1 = b;
}
else
{
a1 = b;
b1 = a;
}
// a1 >= b1;
mfxU32 r = a1 % b1;
while (0 != r)
{
a1 = b1;
b1 = r;
r = a1 % b1;
}
return b1;
}
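// Example: GCD(1920, 1080) == 120, so 1920x1080 reduces to a 16:9 ratio.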
mfxStatus PARtoDAR(DWORD parw, DWORD parh, DWORD w, DWORD h, DWORD& darw, DWORD& darh)
{
MSDK_CHECK_ERROR(parw, 0, MFX_ERR_UNDEFINED_BEHAVIOR);
MSDK_CHECK_ERROR(parh, 0, MFX_ERR_UNDEFINED_BEHAVIOR);
MSDK_CHECK_ERROR(w, 0, MFX_ERR_UNDEFINED_BEHAVIOR);
MSDK_CHECK_ERROR(h, 0, MFX_ERR_UNDEFINED_BEHAVIOR);
mfxU32 reduced_w = 0, reduced_h = 0;
mfxU32 gcd = GCD(w, h);
// Divide by greatest common divisor
reduced_w = w / gcd;
reduced_h = h / gcd;
darw = reduced_w * parw;
darh = reduced_h * parh;
gcd = GCD(darw, darh);
darh /= gcd;
darw /= gcd;
return MFX_ERR_NONE;
}
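// Worked example (illustrative): anamorphic PAL at 720x576 commonly signals PAR 64:45.
// 720:576 reduces to 5:4, so DAR = (5*64):(4*45) = 320:180, which reduces to 16:9.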
mfxStatus DARtoPAR(mfxU32 darw, mfxU32 darh, mfxU32 w, mfxU32 h, mfxU16& parw, mfxU16& parh)
{
MSDK_CHECK_ERROR(darw, 0, MFX_ERR_UNDEFINED_BEHAVIOR);
MSDK_CHECK_ERROR(darh, 0, MFX_ERR_UNDEFINED_BEHAVIOR);
MSDK_CHECK_ERROR(w, 0, MFX_ERR_UNDEFINED_BEHAVIOR);
MSDK_CHECK_ERROR(h, 0, MFX_ERR_UNDEFINED_BEHAVIOR);
mfxU16 reduced_w = 0, reduced_h = 0;
mfxU32 gcd = GCD(w, h);
// Divide by greatest common divisor to fit into mfxU16
reduced_w = (mfxU16) (w / gcd);
reduced_h = (mfxU16) (h / gcd);
// For MPEG-2 we need to set exact PAR values (the standard supports only DAR 4:3, 16:9, 221:100 and 1:1)
if (darw * 3 == darh * 4)
{
parw = 4 * reduced_h;
parh = 3 * reduced_w;
}
else if (darw * 9 == darh * 16)
{
parw = 16 * reduced_h;
parh = 9 * reduced_w;
}
else if (darw * 100 == darh * 221)
{
parw = 221 * reduced_h;
parh = 100 * reduced_w;
}
else if (darw == darh)
{
parw = reduced_h;
parh = reduced_w;
}
else
{
parw = (mfxU16)((DOUBLE)(darw * reduced_h) / (darh * reduced_w) * 1000);
parh = 1000;
}
// Reduce ratio
gcd = GCD(parw, parh);
parw /= (mfxU16)gcd;
parh /= (mfxU16)gcd;
return MFX_ERR_NONE;
}
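// Worked example (illustrative): a 1440x1080 stream flagged as DAR 16:9 takes the
// 16:9 branch above: 1440:1080 reduces to 4:3, so PAR = (16*3):(9*4) = 48:36 = 4:3.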
const char* GetCodecName(DWORD codec)
{
switch (codec)
{
case MFX_CODEC_AVC: return "AVC";
case MFX_CODEC_MPEG2: return "MPEG2";
case MFX_CODEC_VC1: return "VC1";
default:
return "Unknown";
}
}
const char* GetProfileName(DWORD codec, DWORD profile)
{
switch (codec)
{
case MFX_CODEC_AVC:
switch (profile)
{
case QS_PROFILE_H264_BASELINE: return "Baseline";
case QS_PROFILE_H264_CONSTRAINED_BASELINE: return "Constrained Baseline";
case QS_PROFILE_H264_MAIN: return "Main";
case QS_PROFILE_H264_EXTENDED: return "Extended";
case QS_PROFILE_H264_HIGH: return "High";
case QS_PROFILE_H264_HIGH_10: return "High 10";
case QS_PROFILE_H264_HIGH_10_INTRA: return "High 10 Intra";
case QS_PROFILE_H264_HIGH_422: return "High 4:2:2";
case QS_PROFILE_H264_HIGH_422_INTRA: return "High 4:2:2 Intra";
case QS_PROFILE_H264_HIGH_444: return "High 4:4:4";
case QS_PROFILE_H264_HIGH_444_PREDICTIVE: return "High 4:4:4 Predictive";
case QS_PROFILE_H264_HIGH_444_INTRA: return "High 4:4:4 Intra";
case QS_PROFILE_H264_CAVLC_444: return "CAVLC 4:4:4 Intra";
case QS_PROFILE_H264_MULTIVIEW_HIGH: return "Multi View High";
case QS_PROFILE_H264_STEREO_HIGH: return "Stereo High";
}
break;
case MFX_CODEC_MPEG2:
switch (profile)
{
case QS_PROFILE_MPEG2_422: return "4:2:2";
case QS_PROFILE_MPEG2_HIGH: return "High";
case QS_PROFILE_MPEG2_SPATIALLY_SCALABLE: return "Spatially Scalable";
case QS_PROFILE_MPEG2_SNR_SCALABLE: return "SNR Scalable";
case QS_PROFILE_MPEG2_MAIN: return "Main";
case QS_PROFILE_MPEG2_SIMPLE: return "Simple";
}
break;
case MFX_CODEC_VC1:
switch (profile)
{
case QS_PROFILE_VC1_SIMPLE: return "Simple";
case QS_PROFILE_VC1_MAIN: return "Main";
case QS_PROFILE_VC1_COMPLEX: return "Complex";
case QS_PROFILE_VC1_ADVANCED: return "Advanced";
}
break;
default: break;
}
return "Unknown";
}
bool IsSSE41Enabled() // for MOVNTDQA instruction
{
int CPUInfo[4];
__cpuid(CPUInfo, 1);
return 0 != (CPUInfo[2] & (1<<19)); // CPUID.1: ECX bit 19 (CPUInfo[2]) indicates SSE4.1 support
}
bool IsAVX2Enabled() // for VMOVNTDQA
{
int CPUInfo[4];
__cpuidex(CPUInfo, 7, 0); // structured extended feature flags require sub-leaf 0
return 0 != (CPUInfo[1] & (1<<5)); // CPUID.7.0: EBX bit 5 (CPUInfo[1]) indicates AVX2 support
}
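// Sketch of a stricter AVX2 probe (illustrative, not used by the decoder): besides the
// CPUID feature bit, the OS must have enabled XMM/YMM state saving. Assumes the _xgetbv
// intrinsic is available (VS2010 SP1 and later).
static bool IsAVX2UsableSketch()
{
    int info[4];
    __cpuid(info, 1);
    if (0 == (info[2] & (1 << 27)))     // OSXSAVE not set - XGETBV is unavailable
        return false;
    unsigned __int64 xcr0 = _xgetbv(0); // XCR0: extended states enabled by the OS
    if ((xcr0 & 0x6) != 0x6)            // need XMM (bit 1) and YMM (bit 2) state
        return false;
    __cpuidex(info, 7, 0);              // structured extended feature flags, sub-leaf 0
    return 0 != (info[1] & (1 << 5));   // EBX bit 5: AVX2
}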
#define MIN_BUFF_SIZE (1 << 18)
static void* mt_copy(void* d, const void* s, size_t size, Tmemcpy memcpyFunc)
{
MSDK_CHECK_POINTER(d, NULL);
MSDK_CHECK_POINTER(s, NULL);
// Buffer is very small and not worth the effort
if (size < MIN_BUFF_SIZE)
{
return memcpyFunc(d, s, size);
}
size_t blockSize = (size / 2) & ~31; // Make the block size a multiple of 32 bytes
const size_t offsets[] = { 0, blockSize, size };
Concurrency::parallel_for(0, 2, [d, s, &offsets, memcpyFunc](int i)
{
memcpyFunc((char*)d + offsets[i], (const char*)s + offsets[i], offsets[i + 1] - offsets[i]);
});
return d;
}
void* mt_memcpy(void* d, const void* s, size_t size)
{
return mt_copy(d, s, size, memcpy);
}
void* mt_gpu_memcpy(void* d, const void* s, size_t size)
{
return (s_AVX2_enabled) ?
mt_copy(d, s, size, gpu_memcpy_avx2) :
mt_copy(d, s, size, gpu_memcpy_sse41);
}
int GetIntelAdapterIdD3D9(IDirect3D9* _pd3d)
{
CComPtr<IDirect3D9> pd3d = _pd3d;
if (!pd3d)
{
// Attach without an extra AddRef - we own the reference returned by Direct3DCreate9
pd3d.Attach(Direct3DCreate9(D3D_SDK_VERSION));
if (!pd3d) return -1;
}
int adapterCount = (int)pd3d->GetAdapterCount();
D3DADAPTER_IDENTIFIER9 adIdentifier;
for (int i = adapterCount-1; i >= 0; --i)
{
HRESULT hr = pd3d->GetAdapterIdentifier(i, 0, &adIdentifier);
if (SUCCEEDED(hr))
{
// Look for Intel's vendor ID (8086h)
if (adIdentifier.VendorId == 0x8086)
return i;
}
}
return -1;
}
// Get the ID of the adapter (GPU) associated with an MSDK session, -1 for SW session
int GetMSDKAdapterNumber(mfxSession session)
{
int adapterNum = -1; // SW
mfxIMPL impl = MFX_IMPL_SOFTWARE; // default in case no HW IMPL is found
// We don't check error codes below; if anything goes wrong we fall back to the default adapter
if (session)
{
MFXQueryIMPL(session, &impl);
}
else
{
// an auxiliary session, internal to this function
mfxSession auxSession;
memset(&auxSession, 0, sizeof(auxSession));
mfxVersion ver = {1, 1}; // minimum API version which supports multiple devices
MFXInit(MFX_IMPL_HARDWARE_ANY, &ver, &auxSession);
MFXQueryIMPL(auxSession, &impl);
MFXClose(auxSession);
}
// extract the base implementation type
mfxIMPL baseImpl = MFX_IMPL_BASETYPE(impl);
// get corresponding adapter number
for (mfxU8 i = 0; i < sizeof(implTypes)/sizeof(implTypes[0]); ++i)
{
if (implTypes[i].impl == baseImpl)
{
adapterNum = implTypes[i].adapterID;
break;
}
}
return adapterNum;
}
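// Example (illustrative): mapping a session to a D3D9 adapter ordinal.
//   int adapter = GetMSDKAdapterNumber(session); // -1 indicates a software session
//   if (adapter < 0) { /* software decode - no D3D device needed */ }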